From a5cc897cdedfdca018a83fac5734ebe086acb817 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 24 Jan 2025 13:13:18 +0000 Subject: [PATCH 001/432] [gn build] Port 0ee037b861f9 --- llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 9226658d4c767..a4a17a22c1f2c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -181,6 +181,8 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUPromoteKernelArguments.cpp", "AMDGPURegBankCombiner.cpp", "AMDGPURegBankLegalize.cpp", + "AMDGPURegBankLegalizeHelper.cpp", + "AMDGPURegBankLegalizeRules.cpp", "AMDGPURegBankSelect.cpp", "AMDGPURegisterBankInfo.cpp", "AMDGPURemoveIncompatibleFunctions.cpp", From 6292a808b3524d9ba6f4ce55bc5b9e547b088dd8 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Fri, 24 Jan 2025 13:27:56 +0000 Subject: [PATCH 002/432] [NFC][DebugInfo] Use iterator-flavour getFirstNonPHI at many call-sites (#123737) As part of the "RemoveDIs" project, BasicBlock::iterator now carries a debug-info bit that's needed when getFirstNonPHI and similar feed into instruction insertion positions. Call-sites where that's necessary were updated a year ago; but to ensure some type safety however, we'd like to have all calls to getFirstNonPHI use the iterator-returning version. This patch changes a bunch of call-sites calling getFirstNonPHI to use getFirstNonPHIIt, which returns an iterator. All these call sites are where it's obviously safe to fetch the iterator then dereference it. A follow-up patch will contain less-obviously-safe changes. We'll eventually deprecate and remove the instruction-pointer getFirstNonPHI, but not before adding concise documentation of what considerations are needed (very few). 
--------- Co-authored-by: Stephen Tozer --- clang/lib/CodeGen/CGException.cpp | 7 ++- clang/lib/CodeGen/MicrosoftCXXABI.cpp | 2 +- llvm/include/llvm/IR/BasicBlock.h | 2 +- .../llvm/Transforms/Utils/Instrumentation.h | 5 ++ llvm/lib/Analysis/Loads.cpp | 2 +- llvm/lib/Analysis/LoopNestAnalysis.cpp | 2 +- llvm/lib/Analysis/MustExecute.cpp | 2 +- llvm/lib/Analysis/ValueTracking.cpp | 2 +- llvm/lib/CodeGen/AsmPrinter/WinException.cpp | 4 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 6 +- llvm/lib/CodeGen/GlobalMerge.cpp | 2 +- llvm/lib/CodeGen/MachineFunction.cpp | 3 +- llvm/lib/CodeGen/SelectOptimize.cpp | 2 +- .../SelectionDAG/FunctionLoweringInfo.cpp | 6 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 4 +- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 9 ++- llvm/lib/CodeGen/WasmEHPrepare.cpp | 10 ++-- llvm/lib/CodeGen/WinEHPrepare.cpp | 58 ++++++++++--------- llvm/lib/IR/EHPersonalities.cpp | 2 +- llvm/lib/IR/Instructions.cpp | 2 +- llvm/lib/IR/Verifier.cpp | 20 +++---- llvm/lib/Target/BPF/BPFAdjustOpt.cpp | 2 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 3 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 4 +- llvm/lib/Target/X86/X86WinEHState.cpp | 6 +- .../Coroutines/MaterializationUtils.cpp | 6 +- llvm/lib/Transforms/IPO/IROutliner.cpp | 2 +- llvm/lib/Transforms/IPO/PartialInlining.cpp | 2 +- .../Instrumentation/AddressSanitizer.cpp | 6 +- .../Instrumentation/MemorySanitizer.cpp | 5 +- .../NumericalStabilitySanitizer.cpp | 2 +- .../Instrumentation/PGOInstrumentation.cpp | 6 +- .../Instrumentation/PGOMemOPSizeOpt.cpp | 2 +- .../Instrumentation/ThreadSanitizer.cpp | 6 +- llvm/lib/Transforms/ObjCARC/ObjCARC.cpp | 4 +- .../Transforms/ObjCARC/ObjCARCContract.cpp | 2 +- llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 3 +- llvm/lib/Transforms/Scalar/GVN.cpp | 3 +- llvm/lib/Transforms/Scalar/GVNSink.cpp | 2 +- llvm/lib/Transforms/Scalar/LICM.cpp | 7 ++- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 12 ++-- .../lib/Transforms/Scalar/LoopInterchange.cpp | 17 +++--- .../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 2 +- llvm/lib/Transforms/Scalar/SCCP.cpp | 2 +- .../Transforms/Utils/BreakCriticalEdges.cpp | 6 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 6 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 47 +++++++-------- llvm/lib/Transforms/Utils/Local.cpp | 6 +- llvm/lib/Transforms/Utils/LoopSimplify.cpp | 2 +- .../Transforms/Utils/LowerMemIntrinsics.cpp | 11 ++-- llvm/lib/Transforms/Utils/MoveAutoInit.cpp | 2 +- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 4 +- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 7 +-- .../llvm-reduce/deltas/ReduceBasicBlocks.cpp | 2 +- llvm/unittests/Analysis/MemorySSATest.cpp | 2 +- .../Analysis/ProfileSummaryInfoTest.cpp | 10 ++-- .../Frontend/OpenMPIRBuilderTest.cpp | 16 ++--- llvm/unittests/IR/DebugInfoTest.cpp | 10 ++-- llvm/unittests/IR/InstructionsTest.cpp | 4 +- llvm/unittests/Transforms/Scalar/LICMTest.cpp | 4 +- polly/lib/CodeGen/BlockGenerators.cpp | 8 +-- polly/lib/CodeGen/LoopGenerators.cpp | 2 +- .../lib/Transform/MaximalStaticExpansion.cpp | 4 +- 64 files changed, 217 insertions(+), 200 deletions(-) diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index 5dc1686e7914c..5a395c924333e 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -1251,11 +1251,12 @@ void CodeGenFunction::ExitCXXTryStmt(const CXXTryStmt &S, bool IsFnTryBlock) { llvm::BasicBlock *WasmCatchStartBlock = nullptr; if (EHPersonality::get(*this).isWasmPersonality()) { 
auto *CatchSwitch = - cast(DispatchBlock->getFirstNonPHI()); + cast(DispatchBlock->getFirstNonPHIIt()); WasmCatchStartBlock = CatchSwitch->hasUnwindDest() ? CatchSwitch->getSuccessor(1) : CatchSwitch->getSuccessor(0); - auto *CPI = cast(WasmCatchStartBlock->getFirstNonPHI()); + auto *CPI = + cast(WasmCatchStartBlock->getFirstNonPHIIt()); CurrentFuncletPad = CPI; } @@ -2252,7 +2253,7 @@ void CodeGenFunction::ExitSEHTryStmt(const SEHTryStmt &S) { // __except blocks don't get outlined into funclets, so immediately do a // catchret. llvm::CatchPadInst *CPI = - cast(CatchPadBB->getFirstNonPHI()); + cast(CatchPadBB->getFirstNonPHIIt()); llvm::BasicBlock *ExceptBB = createBasicBlock("__except"); Builder.CreateCatchRet(CPI, ExceptBB); EmitBlock(ExceptBB); diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp index 90651c3bafe26..0d53e8cb45fe7 100644 --- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp +++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp @@ -918,7 +918,7 @@ void MicrosoftCXXABI::emitBeginCatch(CodeGenFunction &CGF, VarDecl *CatchParam = S->getExceptionDecl(); llvm::BasicBlock *CatchPadBB = CGF.Builder.GetInsertBlock(); llvm::CatchPadInst *CPI = - cast(CatchPadBB->getFirstNonPHI()); + cast(CatchPadBB->getFirstNonPHIIt()); CGF.CurrentFuncletPad = CPI; // If this is a catch-all or the catch parameter is unnamed, we don't need to diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index f85b221a211b9..e22fe1e7e7dc8 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -673,7 +673,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also void replaceSuccessorsPhiUsesWith(BasicBlock *New); /// Return true if this basic block is an exception handling block. - bool isEHPad() const { return getFirstNonPHI()->isEHPad(); } + bool isEHPad() const { return getFirstNonPHIIt()->isEHPad(); } /// Return true if this basic block is a landing pad. /// diff --git a/llvm/include/llvm/Transforms/Utils/Instrumentation.h b/llvm/include/llvm/Transforms/Utils/Instrumentation.h index 4f67d079d1469..0e2c0d9bfa605 100644 --- a/llvm/include/llvm/Transforms/Utils/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Utils/Instrumentation.h @@ -204,6 +204,11 @@ struct InstrumentationIRBuilder : IRBuilder<> { explicit InstrumentationIRBuilder(Instruction *IP) : IRBuilder<>(IP) { ensureDebugInfo(*this, *IP->getFunction()); } + + explicit InstrumentationIRBuilder(BasicBlock *BB, BasicBlock::iterator It) + : IRBuilder<>(BB, It) { + ensureDebugInfo(*this, *BB->getParent()); + } }; } // end namespace llvm diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 11ccfa33821ca..9279f19b72a3f 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -284,7 +284,7 @@ bool llvm::isDereferenceableAndAlignedInLoop( DL.getTypeStoreSize(LI->getType()).getFixedValue()); const Align Alignment = LI->getAlign(); - Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI(); + Instruction *HeaderFirstNonPHI = &*L->getHeader()->getFirstNonPHIIt(); // If given a uniform (i.e. non-varying) address, see if we can prove the // access is safe within the loop w/o needing predication. 
diff --git a/llvm/lib/Analysis/LoopNestAnalysis.cpp b/llvm/lib/Analysis/LoopNestAnalysis.cpp index fe6d270b9ac53..ead5cf610d9e1 100644 --- a/llvm/lib/Analysis/LoopNestAnalysis.cpp +++ b/llvm/lib/Analysis/LoopNestAnalysis.cpp @@ -346,7 +346,7 @@ static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop, // "guarded" inner loop which contains "only" Phi nodes corresponding to the // LCSSA Phi nodes in the exit block. auto IsExtraPhiBlock = [&](const BasicBlock &BB) { - return BB.getFirstNonPHI() == BB.getTerminator() && + return &*BB.getFirstNonPHIIt() == BB.getTerminator() && all_of(BB.phis(), [&](const PHINode &PN) { return all_of(PN.blocks(), [&](const BasicBlock *IncomingBlock) { return IncomingBlock == InnerLoopExit || diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index d5c665753075c..fde6bbf9eb181 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -275,7 +275,7 @@ bool SimpleLoopSafetyInfo::isGuaranteedToExecute(const Instruction &Inst, // exit. At the moment, we use a (cheap) hack for the common case where // the instruction of interest is the first one in the block. return !HeaderMayThrow || - Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; + &*Inst.getParent()->getFirstNonPHIOrDbg() == &Inst; // If there is a path from header to exit or latch that doesn't lead to our // instruction's block, return false. diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 38f88850be0f1..264fedd6b66b9 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8246,7 +8246,7 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, if (!BB || !Visited.insert(BB).second) break; - Begin = BB->getFirstNonPHI()->getIterator(); + Begin = BB->getFirstNonPHIIt(); End = BB->end(); } return false; diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp index 6d6432b61f2d7..97b4a6a42d81d 100644 --- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp @@ -928,8 +928,8 @@ void WinException::computeIP2StateTable( BaseState = NullState; StartLabel = Asm->getFunctionBegin(); } else { - auto *FuncletPad = - cast(FuncletStart->getBasicBlock()->getFirstNonPHI()); + auto *FuncletPad = cast( + FuncletStart->getBasicBlock()->getFirstNonPHIIt()); assert(FuncInfo.FuncletBaseStateMap.count(FuncletPad) != 0); BaseState = FuncInfo.FuncletBaseStateMap.find(FuncletPad)->second; StartLabel = getMCSymbolForMBB(Asm, &*FuncletStart); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index f668e41094bbc..21622ea43724c 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2866,7 +2866,7 @@ bool IRTranslator::findUnwindDestinations( } while (EHPadBB) { - const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock::const_iterator Pad = EHPadBB->getFirstNonPHIIt(); BasicBlock *NewEHPadBB = nullptr; if (isa(Pad)) { // Stop on landingpads. They are not funclets. @@ -2927,7 +2927,7 @@ bool IRTranslator::translateInvoke(const User &U, return false; // FIXME: support Windows exception handling. 
- if (!isa(EHPadBB->getFirstNonPHI())) + if (!isa(EHPadBB->getFirstNonPHIIt())) return false; // FIXME: support Windows dllimport function calls and calls through @@ -4031,7 +4031,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { MF->push_back(EntryBB); EntryBuilder->setMBB(*EntryBB); - DebugLoc DbgLoc = F.getEntryBlock().getFirstNonPHI()->getDebugLoc(); + DebugLoc DbgLoc = F.getEntryBlock().getFirstNonPHIIt()->getDebugLoc(); SwiftError.setFunction(CurMF); SwiftError.createEntriesInEntryBlock(DbgLoc); diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 9d4547df046d4..7b76155b175d1 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -633,7 +633,7 @@ void GlobalMergeImpl::setMustKeepGlobalVariables(Module &M) { for (Function &F : M) { for (BasicBlock &BB : F) { - Instruction *Pad = BB.getFirstNonPHI(); + BasicBlock::iterator Pad = BB.getFirstNonPHIIt(); auto *II = dyn_cast(Pad); if (!Pad->isEHPad() && !(II && II->getIntrinsicID() == Intrinsic::eh_typeid_for)) diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index e4824183e8dfc..ab3609b6141b8 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -833,7 +833,8 @@ MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) { LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad); LP.LandingPadLabel = LandingPadLabel; - const Instruction *FirstI = LandingPad->getBasicBlock()->getFirstNonPHI(); + BasicBlock::const_iterator FirstI = + LandingPad->getBasicBlock()->getFirstNonPHIIt(); if (const auto *LPI = dyn_cast(FirstI)) { // If there's no typeid list specified, then "cleanup" is implicit. // Otherwise, id 0 is reserved for the cleanup action. diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 57488a90e7a4a..b7600a3b7fba7 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -1217,7 +1217,7 @@ bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L, return true; OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", - L->getHeader()->getFirstNonPHI()); + &*L->getHeader()->getFirstNonPHIIt()); if (LoopCost[0].NonPredCost > LoopCost[0].PredCost || LoopCost[1].NonPredCost >= LoopCost[1].PredCost) { diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 3e89b18585f15..33c6341744478 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -250,7 +250,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Don't create MachineBasicBlocks for imaginary EH pad blocks. These blocks // are really data, and no instructions can live here. if (BB.isEHPad()) { - const Instruction *PadInst = BB.getFirstNonPHI(); + BasicBlock::const_iterator PadInst = BB.getFirstNonPHIIt(); // If this is a non-landingpad EH pad, mark this function as using // funclets. 
// FIXME: SEH catchpads do not create EH scope/funclets, so we could avoid @@ -261,13 +261,13 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF->getFrameInfo().setHasOpaqueSPAdjustment(true); } if (isa(PadInst)) { - assert(&*BB.begin() == PadInst && + assert(BB.begin() == PadInst && "WinEHPrepare failed to remove PHIs from imaginary BBs"); continue; } if (isa(PadInst) && Personality != EHPersonality::Wasm_CXX) - assert(&*BB.begin() == PadInst && "WinEHPrepare failed to demote PHIs"); + assert(BB.begin() == PadInst && "WinEHPrepare failed to demote PHIs"); } MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(&BB); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ecaa61fdc86a4..428e7a316d247 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2063,7 +2063,7 @@ static void findWasmUnwindDestinations( SmallVectorImpl> &UnwindDests) { while (EHPadBB) { - const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock::const_iterator Pad = EHPadBB->getFirstNonPHIIt(); if (isa(Pad)) { // Stop on cleanup pads. UnwindDests.emplace_back(FuncInfo.getMBB(EHPadBB), Prob); @@ -2111,7 +2111,7 @@ static void findUnwindDestinations( } while (EHPadBB) { - const Instruction *Pad = EHPadBB->getFirstNonPHI(); + BasicBlock::const_iterator Pad = EHPadBB->getFirstNonPHIIt(); BasicBlock *NewEHPadBB = nullptr; if (isa(Pad)) { // Stop on landingpads. They are not funclets. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 3b1abf7f3d994..899f83bbc6064 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -1421,7 +1421,7 @@ bool SelectionDAGISel::PrepareEHLandingPad() { // Catchpads have one live-in register, which typically holds the exception // pointer or code. if (isFuncletEHPersonality(Pers)) { - if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHI())) { + if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHIIt())) { if (hasExceptionPointerOrCodeUser(CPI)) { // Get or create the virtual register to hold the pointer or code. Mark // the live in physreg and copy into the vreg. @@ -1452,7 +1452,7 @@ bool SelectionDAGISel::PrepareEHLandingPad() { MF->getRegInfo().addPhysRegsUsedFromRegMask(RegMask); if (Pers == EHPersonality::Wasm_CXX) { - if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHI())) + if (const auto *CPI = dyn_cast(LLVMBB->getFirstNonPHIIt())) mapWasmLandingPadIndex(MBB, CPI); } else { // Assign the call site to the landing pad's begin label. @@ -1721,13 +1721,12 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // use anything def'd by or after the tail call. 
{ BasicBlock::iterator BBStart = - const_cast(LLVMBB)->getFirstNonPHI()->getIterator(); + const_cast(LLVMBB)->getFirstNonPHIIt(); BasicBlock::iterator BBEnd = const_cast(LLVMBB)->end(); preserveFakeUses(BBStart, BBEnd); } - BasicBlock::const_iterator const Begin = - LLVMBB->getFirstNonPHI()->getIterator(); + BasicBlock::const_iterator const Begin = LLVMBB->getFirstNonPHIIt(); BasicBlock::const_iterator const End = LLVMBB->end(); BasicBlock::const_iterator BI = End; diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 1701b0d04425d..d18196b2217f5 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -227,7 +227,7 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { for (BasicBlock &BB : F) { if (!BB.isEHPad()) continue; - auto *Pad = BB.getFirstNonPHI(); + BasicBlock::iterator Pad = BB.getFirstNonPHIIt(); if (isa(Pad)) CatchPads.push_back(&BB); else if (isa(Pad)) @@ -284,7 +284,7 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { unsigned Index = 0; for (auto *BB : CatchPads) { - auto *CPI = cast(BB->getFirstNonPHI()); + auto *CPI = cast(BB->getFirstNonPHIIt()); // In case of a single catch (...), we don't need to emit a personalify // function call if (CPI->arg_size() == 1 && @@ -309,7 +309,7 @@ void WasmEHPrepareImpl::prepareEHPad(BasicBlock *BB, bool NeedPersonality, IRBuilder<> IRB(BB->getContext()); IRB.SetInsertPoint(BB, BB->getFirstInsertionPt()); - auto *FPI = cast(BB->getFirstNonPHI()); + auto *FPI = cast(BB->getFirstNonPHIIt()); Instruction *GetExnCI = nullptr, *GetSelectorCI = nullptr; for (auto &U : FPI->uses()) { if (auto *CI = dyn_cast(U.getUser())) { @@ -388,13 +388,13 @@ void llvm::calculateWasmEHInfo(const Function *F, WasmEHFuncInfo &EHInfo) { for (const auto &BB : *F) { if (!BB.isEHPad()) continue; - const Instruction *Pad = BB.getFirstNonPHI(); + const Instruction *Pad = &*BB.getFirstNonPHIIt(); if (const auto *CatchPad = dyn_cast(Pad)) { const auto *UnwindBB = CatchPad->getCatchSwitch()->getUnwindDest(); if (!UnwindBB) continue; - const Instruction *UnwindPad = UnwindBB->getFirstNonPHI(); + const Instruction *UnwindPad = &*UnwindBB->getFirstNonPHIIt(); if (const auto *CatchSwitch = dyn_cast(UnwindPad)) // Currently there should be only one handler per a catchswitch. 
EHInfo.setUnwindDest(&BB, *CatchSwitch->handlers().begin()); diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index c58c67b70fe3c..6d85f07829033 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -201,7 +201,7 @@ static void calculateStateNumbersForInvokes(const Function *Fn, BasicBlock *FuncletUnwindDest; auto *FuncletPad = - dyn_cast(FuncletEntryBB->getFirstNonPHI()); + dyn_cast(FuncletEntryBB->getFirstNonPHIIt()); assert(FuncletPad || FuncletEntryBB == &Fn->getEntryBlock()); if (!FuncletPad) FuncletUnwindDest = nullptr; @@ -223,7 +223,7 @@ static void calculateStateNumbersForInvokes(const Function *Fn, if (BaseState != -1) { FuncInfo.InvokeStateMap[II] = BaseState; } else { - Instruction *PadInst = InvokeUnwindDest->getFirstNonPHI(); + Instruction *PadInst = &*InvokeUnwindDest->getFirstNonPHIIt(); assert(FuncInfo.EHPadStateMap.count(PadInst) && "EH Pad has no state!"); FuncInfo.InvokeStateMap[II] = FuncInfo.EHPadStateMap[PadInst]; } @@ -254,10 +254,10 @@ void llvm::calculateCXXStateForAsynchEH(const BasicBlock *BB, int State, if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State) continue; // skip blocks already visited by lower State - const llvm::Instruction *I = BB->getFirstNonPHI(); + BasicBlock::const_iterator It = BB->getFirstNonPHIIt(); const llvm::Instruction *TI = BB->getTerminator(); - if (I->isEHPad()) - State = EHInfo.EHPadStateMap[I]; + if (It->isEHPad()) + State = EHInfo.EHPadStateMap[&*It]; EHInfo.BlockToStateMap[BB] = State; // Record state, also flag visiting if ((isa(TI) || isa(TI)) && State > 0) { @@ -315,15 +315,15 @@ void llvm::calculateSEHStateForAsynchEH(const BasicBlock *BB, int State, if (EHInfo.BlockToStateMap.count(BB) && EHInfo.BlockToStateMap[BB] <= State) continue; // skip blocks already visited by lower State - const llvm::Instruction *I = BB->getFirstNonPHI(); + BasicBlock::const_iterator It = BB->getFirstNonPHIIt(); const llvm::Instruction *TI = BB->getTerminator(); - if (I->isEHPad()) - State = EHInfo.EHPadStateMap[I]; + if (It->isEHPad()) + State = EHInfo.EHPadStateMap[&*It]; EHInfo.BlockToStateMap[BB] = State; // Record state - if (isa(I) && isa(TI)) { + if (isa(It) && isa(TI)) { const Constant *FilterOrNull = cast( - cast(I)->getArgOperand(0)->stripPointerCasts()); + cast(It)->getArgOperand(0)->stripPointerCasts()); const Function *Filter = dyn_cast(FilterOrNull); if (!Filter || !Filter->getName().starts_with("__IsLocalUnwind")) State = EHInfo.SEHUnwindMap[State].ToState; // Retrive next State @@ -385,7 +385,7 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, SmallVector Handlers; for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) { - auto *CatchPad = cast(CatchPadBB->getFirstNonPHI()); + auto *CatchPad = cast(CatchPadBB->getFirstNonPHIIt()); Handlers.push_back(CatchPad); } int TryLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); @@ -393,7 +393,7 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) if ((PredBlock = getEHPadFromPredecessor(PredBlock, CatchSwitch->getParentPad()))) - calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateCXXStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), TryLow); int CatchLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr); @@ -456,7 +456,7 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) { if ((PredBlock = 
getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad()))) { - calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateCXXStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), CleanupState); } } @@ -509,7 +509,7 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, assert(CatchSwitch->getNumHandlers() == 1 && "SEH doesn't have multiple handlers per __try"); const auto *CatchPad = - cast((*CatchSwitch->handler_begin())->getFirstNonPHI()); + cast((*CatchSwitch->handler_begin())->getFirstNonPHIIt()); const BasicBlock *CatchPadBB = CatchPad->getParent(); const Constant *FilterOrNull = cast(CatchPad->getArgOperand(0)->stripPointerCasts()); @@ -526,7 +526,7 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) if ((PredBlock = getEHPadFromPredecessor(PredBlock, CatchSwitch->getParentPad()))) - calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateSEHStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), TryState); // Everything in the __except block unwinds to ParentState, just like code @@ -562,7 +562,7 @@ static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo, for (const BasicBlock *PredBlock : predecessors(BB)) if ((PredBlock = getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad()))) - calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(), + calculateSEHStateNumbers(FuncInfo, &*PredBlock->getFirstNonPHIIt(), CleanupState); for (const User *U : CleanupPad->users()) { const auto *UserI = cast(U); @@ -594,7 +594,7 @@ void llvm::calculateSEHStateNumbers(const Function *Fn, for (const BasicBlock &BB : *Fn) { if (!BB.isEHPad()) continue; - const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + const Instruction *FirstNonPHI = &*BB.getFirstNonPHIIt(); if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; ::calculateSEHStateNumbers(FuncInfo, FirstNonPHI, -1); @@ -618,7 +618,7 @@ void llvm::calculateWinCXXEHStateNumbers(const Function *Fn, for (const BasicBlock &BB : *Fn) { if (!BB.isEHPad()) continue; - const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + const Instruction *FirstNonPHI = &*BB.getFirstNonPHIIt(); if (!isTopLevelPadForMSVC(FirstNonPHI)) continue; calculateCXXStateNumbers(FuncInfo, FirstNonPHI, -1); @@ -678,7 +678,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // Seed a worklist with pads that have no parent. SmallVector, 8> Worklist; for (const BasicBlock &BB : *Fn) { - const Instruction *FirstNonPHI = BB.getFirstNonPHI(); + const Instruction *FirstNonPHI = &*BB.getFirstNonPHIIt(); const Value *ParentPad; if (const auto *CPI = dyn_cast(FirstNonPHI)) ParentPad = CPI->getParentPad(); @@ -725,7 +725,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, for (const BasicBlock *CatchBlock : llvm::reverse(CatchBlocks)) { // Create the entry for this catch with the appropriate handler // properties. - const auto *Catch = cast(CatchBlock->getFirstNonPHI()); + const auto *Catch = cast(CatchBlock->getFirstNonPHIIt()); uint32_t TypeToken = static_cast( cast(Catch->getArgOperand(0))->getZExtValue()); CatchState = @@ -751,7 +751,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // so visit pads in descendant-most to ancestor-most order. 
for (ClrEHUnwindMapEntry &Entry : llvm::reverse(FuncInfo.ClrEHUnwindMap)) { const Instruction *Pad = - cast(Entry.Handler)->getFirstNonPHI(); + &*cast(Entry.Handler)->getFirstNonPHIIt(); // For most pads, the TryParentState is the state associated with the // unwind dest of exceptional exits from it. const BasicBlock *UnwindDest; @@ -800,7 +800,7 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, // Now we have an unwind dest for the user, but we need to see if it // unwinds all the way out of the cleanup or if it stays within it. - const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI(); + const Instruction *UserUnwindPad = &*UserUnwindDest->getFirstNonPHIIt(); const Value *UserUnwindParent; if (auto *CSI = dyn_cast(UserUnwindPad)) UserUnwindParent = CSI->getParentPad(); @@ -835,7 +835,8 @@ void llvm::calculateClrEHStateNumbers(const Function *Fn, if (!UnwindDest) { UnwindDestState = -1; } else { - UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()]; + UnwindDestState = + FuncInfo.EHPadStateMap[&*UnwindDest->getFirstNonPHIIt()]; } Entry.TryParentState = UnwindDestState; @@ -863,7 +864,8 @@ void WinEHPrepareImpl::demotePHIsOnFunclets(Function &F, for (BasicBlock &BB : make_early_inc_range(F)) { if (!BB.isEHPad()) continue; - if (DemoteCatchSwitchPHIOnly && !isa(BB.getFirstNonPHI())) + if (DemoteCatchSwitchPHIOnly && + !isa(BB.getFirstNonPHIIt())) continue; for (Instruction &I : make_early_inc_range(BB)) { @@ -898,7 +900,7 @@ void WinEHPrepareImpl::cloneCommonBlocks(Function &F) { if (FuncletPadBB == &F.getEntryBlock()) FuncletToken = ConstantTokenNone::get(F.getContext()); else - FuncletToken = FuncletPadBB->getFirstNonPHI(); + FuncletToken = &*FuncletPadBB->getFirstNonPHIIt(); std::vector> Orig2Clone; ValueToValueMapTy VMap; @@ -1094,7 +1096,7 @@ void WinEHPrepareImpl::removeImplausibleInstructions(Function &F) { for (auto &Funclet : FuncletBlocks) { BasicBlock *FuncletPadBB = Funclet.first; std::vector &BlocksInFunclet = Funclet.second; - Instruction *FirstNonPHI = FuncletPadBB->getFirstNonPHI(); + Instruction *FirstNonPHI = &*FuncletPadBB->getFirstNonPHIIt(); auto *FuncletPad = dyn_cast(FirstNonPHI); auto *CatchPad = dyn_cast_or_null(FuncletPad); auto *CleanupPad = dyn_cast_or_null(FuncletPad); @@ -1228,7 +1230,7 @@ bool WinEHPrepareImpl::prepareExplicitEH(Function &F) { AllocaInst *WinEHPrepareImpl::insertPHILoads(PHINode *PN, Function &F) { BasicBlock *PHIBlock = PN->getParent(); AllocaInst *SpillSlot = nullptr; - Instruction *EHPad = PHIBlock->getFirstNonPHI(); + Instruction *EHPad = &*PHIBlock->getFirstNonPHIIt(); if (!EHPad->isTerminator()) { // If the EHPad isn't a terminator, then we can insert a load in this block @@ -1303,7 +1305,7 @@ void WinEHPrepareImpl::insertPHIStore( BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot, SmallVectorImpl> &Worklist) { - if (PredBlock->isEHPad() && PredBlock->getFirstNonPHI()->isTerminator()) { + if (PredBlock->isEHPad() && PredBlock->getFirstNonPHIIt()->isTerminator()) { // Pred is unsplittable, so we need to queue it on the worklist. 
Worklist.push_back({PredBlock, PredVal}); return; diff --git a/llvm/lib/IR/EHPersonalities.cpp b/llvm/lib/IR/EHPersonalities.cpp index 7c32601b8a83e..575130bff7a34 100644 --- a/llvm/lib/IR/EHPersonalities.cpp +++ b/llvm/lib/IR/EHPersonalities.cpp @@ -129,7 +129,7 @@ DenseMap llvm::colorEHFunclets(Function &F) { DEBUG_WITH_TYPE("win-eh-prepare-coloring", dbgs() << "Visiting " << Visiting->getName() << ", " << Color->getName() << "\n"); - Instruction *VisitingHead = Visiting->getFirstNonPHI(); + BasicBlock::iterator VisitingHead = Visiting->getFirstNonPHIIt(); if (VisitingHead->isEHPad()) { // Mark this funclet head as a member of itself. Color = Visiting; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b585d8cfbf2e2..c9f5807765e40 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -832,7 +832,7 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef OpB, } LandingPadInst *InvokeInst::getLandingPadInst() const { - return cast(getUnwindDest()->getFirstNonPHI()); + return cast(getUnwindDest()->getFirstNonPHIIt()); } void InvokeInst::updateProfWeight(uint64_t S, uint64_t T) { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 00280dbe5300b..54de812517438 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2726,7 +2726,7 @@ static Instruction *getSuccPad(Instruction *Terminator) { UnwindDest = CSI->getUnwindDest(); else UnwindDest = cast(Terminator)->getUnwindDest(); - return UnwindDest->getFirstNonPHI(); + return &*UnwindDest->getFirstNonPHIIt(); } void Verifier::verifySiblingFuncletUnwinds() { @@ -4585,7 +4585,7 @@ void Verifier::visitCatchPadInst(CatchPadInst &CPI) { // The catchpad instruction must be the first non-PHI instruction in the // block. - Check(BB->getFirstNonPHI() == &CPI, + Check(&*BB->getFirstNonPHIIt() == &CPI, "CatchPadInst not the first non-PHI instruction in the block.", &CPI); visitEHPadPredecessors(CPI); @@ -4609,7 +4609,7 @@ void Verifier::visitCleanupPadInst(CleanupPadInst &CPI) { // The cleanuppad instruction must be the first non-PHI instruction in the // block. - Check(BB->getFirstNonPHI() == &CPI, + Check(&*BB->getFirstNonPHIIt() == &CPI, "CleanupPadInst not the first non-PHI instruction in the block.", &CPI); auto *ParentPad = CPI.getParentPad(); @@ -4664,7 +4664,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { Value *UnwindPad; bool ExitsFPI; if (UnwindDest) { - UnwindPad = UnwindDest->getFirstNonPHI(); + UnwindPad = &*UnwindDest->getFirstNonPHIIt(); if (!cast(UnwindPad)->isEHPad()) continue; Value *UnwindParent = getParentPad(UnwindPad); @@ -4767,7 +4767,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { BasicBlock *SwitchUnwindDest = CatchSwitch->getUnwindDest(); Value *SwitchUnwindPad; if (SwitchUnwindDest) - SwitchUnwindPad = SwitchUnwindDest->getFirstNonPHI(); + SwitchUnwindPad = &*SwitchUnwindDest->getFirstNonPHIIt(); else SwitchUnwindPad = ConstantTokenNone::get(FPI.getContext()); Check(SwitchUnwindPad == FirstUnwindPad, @@ -4790,7 +4790,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { // The catchswitch instruction must be the first non-PHI instruction in the // block. 
- Check(BB->getFirstNonPHI() == &CatchSwitch, + Check(&*BB->getFirstNonPHIIt() == &CatchSwitch, "CatchSwitchInst not the first non-PHI instruction in the block.", &CatchSwitch); @@ -4799,14 +4799,14 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { "CatchSwitchInst has an invalid parent.", ParentPad); if (BasicBlock *UnwindDest = CatchSwitch.getUnwindDest()) { - Instruction *I = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator I = UnwindDest->getFirstNonPHIIt(); Check(I->isEHPad() && !isa(I), "CatchSwitchInst must unwind to an EH block which is not a " "landingpad.", &CatchSwitch); // Record catchswitch sibling unwinds for verifySiblingFuncletUnwinds - if (getParentPad(I) == ParentPad) + if (getParentPad(&*I) == ParentPad) SiblingFuncletInfo[&CatchSwitch] = &CatchSwitch; } @@ -4814,7 +4814,7 @@ void Verifier::visitCatchSwitchInst(CatchSwitchInst &CatchSwitch) { "CatchSwitchInst cannot have empty handler list", &CatchSwitch); for (BasicBlock *Handler : CatchSwitch.handlers()) { - Check(isa(Handler->getFirstNonPHI()), + Check(isa(Handler->getFirstNonPHIIt()), "CatchSwitchInst handlers must be catchpads", &CatchSwitch, Handler); } @@ -4828,7 +4828,7 @@ void Verifier::visitCleanupReturnInst(CleanupReturnInst &CRI) { CRI.getOperand(0)); if (BasicBlock *UnwindDest = CRI.getUnwindDest()) { - Instruction *I = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator I = UnwindDest->getFirstNonPHIIt(); Check(I->isEHPad() && !isa(I), "CleanupReturnInst must unwind to an EH block which is not a " "landingpad.", diff --git a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp index 1b2558d2e4009..afc47968bf657 100644 --- a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp +++ b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp @@ -222,7 +222,7 @@ bool BPFAdjustOptImpl::serializeICMPCrossBB(BasicBlock &BB) { if (!BI || !BI->isConditional()) return false; auto *Cond = dyn_cast(BI->getCondition()); - if (!Cond || B2->getFirstNonPHI() != Cond) + if (!Cond || &*B2->getFirstNonPHIIt() != Cond) return false; Value *B2Op0 = Cond->getOperand(0); auto Cond2Op = Cond->getPredicate(); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 991ee5b1cbaa5..d2ae2ef7bd7ff 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1097,8 +1097,7 @@ bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB, promoteTo(In, DestTy, LoopB); // Fix up the PHI nodes in the exit block. - Instruction *EndI = ExitB->getFirstNonPHI(); - BasicBlock::iterator End = EndI ? 
EndI->getIterator() : ExitB->end(); + BasicBlock::iterator End = ExitB->getFirstNonPHIIt(); for (auto I = ExitB->begin(); I != End; ++I) { PHINode *P = dyn_cast(I); if (!P) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index 839a206033a0c..c60cf69c30104 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1199,7 +1199,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) { // Look for orphan landingpads, can occur in blocks with no predecessors for (BasicBlock &BB : F) { - Instruction *I = BB.getFirstNonPHI(); + BasicBlock::iterator I = BB.getFirstNonPHIIt(); if (auto *LPI = dyn_cast(I)) LandingPads.insert(LPI); } @@ -1739,7 +1739,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::handleLongjmpableCallsForWasmSjLj( SmallVector ToErase; for (auto &BB : F) { - if (auto *CSI = dyn_cast(BB.getFirstNonPHI())) { + if (auto *CSI = dyn_cast(BB.getFirstNonPHIIt())) { if (CSI != CatchSwitchLongjmp && CSI->unwindsToCaller()) { IRB.SetInsertPoint(CSI); ToErase.push_back(CSI); diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 35b7d7f508b02..7d6d3f8d21f25 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -513,7 +513,7 @@ int WinEHStatePass::getBaseStateForBB( assert(BBColors.size() == 1 && "multi-color BB not removed by preparation"); BasicBlock *FuncletEntryBB = BBColors.front(); if (auto *FuncletPad = - dyn_cast(FuncletEntryBB->getFirstNonPHI())) { + dyn_cast(FuncletEntryBB->getFirstNonPHIIt())) { auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad); if (BaseStateI != FuncInfo.FuncletBaseStateMap.end()) BaseState = BaseStateI->second; @@ -741,7 +741,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { for (BasicBlock *BB : RPOT) { auto &BBColors = BlockColors[BB]; BasicBlock *FuncletEntryBB = BBColors.front(); - if (isa(FuncletEntryBB->getFirstNonPHI())) + if (isa(FuncletEntryBB->getFirstNonPHIIt())) continue; int PrevState = getPredState(FinalStates, F, ParentBaseState, BB); @@ -783,7 +783,7 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { for (CallBase *Call : SetJmp3Calls) { auto &BBColors = BlockColors[Call->getParent()]; BasicBlock *FuncletEntryBB = BBColors.front(); - bool InCleanup = isa(FuncletEntryBB->getFirstNonPHI()); + bool InCleanup = isa(FuncletEntryBB->getFirstNonPHIIt()); IRBuilder<> Builder(Call); Value *State; diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp index cc462011a6242..3686c7c153999 100644 --- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp +++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp @@ -180,12 +180,12 @@ static void rewriteMaterializableInstructions( // insert the remats into the end of the predecessor (there should only be // one). This is so that suspend blocks always have the suspend instruction // as the first instruction. 
- auto InsertPoint = &*Use->getParent()->getFirstInsertionPt(); + BasicBlock::iterator InsertPoint = Use->getParent()->getFirstInsertionPt(); if (isa(Use)) { BasicBlock *SuspendPredecessorBlock = Use->getParent()->getSinglePredecessor(); assert(SuspendPredecessorBlock && "malformed coro suspend instruction"); - InsertPoint = SuspendPredecessorBlock->getTerminator(); + InsertPoint = SuspendPredecessorBlock->getTerminator()->getIterator(); } // Note: skip the first instruction as this is the actual use that we're @@ -197,7 +197,7 @@ static void rewriteMaterializableInstructions( CurrentMaterialization = D->clone(); CurrentMaterialization->setName(D->getName()); CurrentMaterialization->insertBefore(InsertPoint); - InsertPoint = CurrentMaterialization; + InsertPoint = CurrentMaterialization->getIterator(); // Replace all uses of Def in the instructions being added as part of this // rematerialization group diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index 6cc218e63a012..3f54106bd09fe 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -1754,7 +1754,7 @@ findOrCreatePHIInBlock(PHINode &PN, OutlinableRegion &Region, // If we've made it here, it means we weren't able to replace the PHINode, so // we must insert it ourselves. PHINode *NewPN = cast(PN.clone()); - NewPN->insertBefore(&*OverallPhiBlock->begin()); + NewPN->insertBefore(OverallPhiBlock->begin()); for (unsigned Idx = 0, Edx = NewPN->getNumIncomingValues(); Idx < Edx; Idx++) { Value *IncomingVal = NewPN->getIncomingValue(Idx); diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp index 28c81465a0948..cead7b84c3fc8 100644 --- a/llvm/lib/Transforms/IPO/PartialInlining.cpp +++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp @@ -1039,7 +1039,7 @@ void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const { }; ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock( - ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator()); + ClonedOI->ReturnBlock->getFirstNonPHIIt()); BasicBlock::iterator I = PreReturn->begin(); BasicBlock::iterator Ins = ClonedOI->ReturnBlock->begin(); SmallVector DeadPhis; diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index cca6f78084b46..e5f3b7f24bca7 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -689,11 +689,11 @@ class RuntimeCallInserter { } BasicBlock *Color = Colors.front(); - Instruction *EHPad = Color->getFirstNonPHI(); + BasicBlock::iterator EHPadIt = Color->getFirstNonPHIIt(); - if (EHPad && EHPad->isEHPad()) { + if (EHPadIt != Color->end() && EHPadIt->isEHPad()) { // Replace CI with a clone with an added funclet OperandBundle - OperandBundleDef OB("funclet", EHPad); + OperandBundleDef OB("funclet", &*EHPadIt); auto *NewCall = CallBase::addOperandBundle(CI, LLVMContext::OB_funclet, OB, CI->getIterator()); NewCall->copyMetadata(*CI); diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 8e9b85c8d6857..56d3eb10d73e9 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1205,8 +1205,9 @@ struct MemorySanitizerVisitor : public InstVisitor { removeUnreachableBlocks(F); MS.initializeCallbacks(*F.getParent(), TLI); - FnPrologueEnd = 
IRBuilder<>(F.getEntryBlock().getFirstNonPHI()) - .CreateIntrinsic(Intrinsic::donothing, {}, {}); + FnPrologueEnd = + IRBuilder<>(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHIIt()) + .CreateIntrinsic(Intrinsic::donothing, {}, {}); if (MS.CompileKernel) { IRBuilder<> IRB(FnPrologueEnd); diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp index 57e39c4eae966..d396dbf75eebc 100644 --- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp @@ -760,7 +760,7 @@ void NumericalStabilitySanitizer::createShadowArguments( })) return; - IRBuilder<> Builder(F.getEntryBlock().getFirstNonPHI()); + IRBuilder<> Builder(&F.getEntryBlock(), F.getEntryBlock().getFirstNonPHIIt()); // The function has shadow args if the shadow args tag matches the function // address. Value *HasShadowArgs = Builder.CreateICmpEQ( diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index db4d62ec36751..5ad07e83d1273 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -910,9 +910,9 @@ populateEHOperandBundle(VPCandidateInfo &Cand, if (!BlockColors.empty()) { const ColorVector &CV = BlockColors.find(OrigCall->getParent())->second; assert(CV.size() == 1 && "non-unique color for block!"); - Instruction *EHPad = CV.front()->getFirstNonPHI(); - if (EHPad->isEHPad()) - OpBundles.emplace_back("funclet", EHPad); + BasicBlock::iterator EHPadIt = CV.front()->getFirstNonPHIIt(); + if (EHPadIt->isEHPad()) + OpBundles.emplace_back("funclet", &*EHPadIt); } } } diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp index dc51c564fbe0d..f6780c0f06b18 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp @@ -386,7 +386,7 @@ bool MemOPSizeOpt::perform(MemOp MO) { PHINode *PHI = nullptr; if (!MemOpTy->isVoidTy()) { // Insert a phi for the return values at the merge block. - IRBuilder<> IRBM(MergeBB->getFirstNonPHI()); + IRBuilder<> IRBM(MergeBB, MergeBB->getFirstNonPHIIt()); PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge"); MO.I->replaceAllUsesWith(PHI); PHI->addIncoming(MO.I, DefaultBB); diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index e0070e583b681..7deaac5e59a28 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -479,7 +479,8 @@ static bool isTsanAtomic(const Instruction *I) { } void ThreadSanitizer::InsertRuntimeIgnores(Function &F) { - InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); + InstrumentationIRBuilder IRB(&F.getEntryBlock(), + F.getEntryBlock().getFirstNonPHIIt()); IRB.CreateCall(TsanIgnoreBegin); EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions); while (IRBuilder<> *AtExit = EE.Next()) { @@ -569,7 +570,8 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, // Instrument function entry/exit points if there were instrumented accesses. 
if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { - InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); + InstrumentationIRBuilder IRB(&F.getEntryBlock(), + F.getEntryBlock().getFirstNonPHIIt()); Value *ReturnAddress = IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp index 33870d7ea192a..b6ade1c29a2b5 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -32,9 +32,9 @@ CallInst *objcarc::createCallInstWithColors( if (!BlockColors.empty()) { const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second; assert(CV.size() == 1 && "non-unique color for block!"); - Instruction *EHPad = CV.front()->getFirstNonPHI(); + BasicBlock::iterator EHPad = CV.front()->getFirstNonPHIIt(); if (EHPad->isEHPad()) - OpBundles.emplace_back("funclet", EHPad); + OpBundles.emplace_back("funclet", &*EHPad); } return CallInst::Create(FTy, Callee, Args, OpBundles, NameStr, InsertBefore); diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp index b020591c203db..8407726a69c0b 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -627,7 +627,7 @@ bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) { // block with a catchswitch has no insertion point. Keep going up // the dominator tree until we find a non-catchswitch. BasicBlock *InsertBB = IncomingBB; - while (isa(InsertBB->getFirstNonPHI())) { + while (isa(InsertBB->getFirstNonPHIIt())) { InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock(); } diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 340d55190a5e6..9d7f5e64f9868 100644 --- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -583,7 +583,8 @@ class ObjCARCOpt { const ColorVector &CV = BlockEHColors.find(BB)->second; assert(CV.size() > 0 && "Uncolored block"); for (BasicBlock *EHPadBB : CV) - if (auto *EHPad = dyn_cast(EHPadBB->getFirstNonPHI())) { + if (auto *EHPad = + dyn_cast(EHPadBB->getFirstNonPHIIt())) { OpBundles.emplace_back("funclet", EHPad); return; } diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index def4add46e5ba..21eb7f741d7c8 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1720,7 +1720,8 @@ bool GVNPass::PerformLoadPRE(LoadInst *Load, AvailValInBlkVect &ValuesPerBlock, // to speculatively execute the load at that points. if (MustEnsureSafetyOfSpeculativeExecution) { if (CriticalEdgePredSplit.size()) - if (!isSafeToSpeculativelyExecute(Load, LoadBB->getFirstNonPHI(), AC, DT)) + if (!isSafeToSpeculativelyExecute(Load, &*LoadBB->getFirstNonPHIIt(), AC, + DT)) return false; for (auto &PL : PredLoads) if (!isSafeToSpeculativelyExecute(Load, PL.first->getTerminator(), AC, diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 730f5cd0f8d0d..6651281ff2d01 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -906,7 +906,7 @@ void GVNSink::sinkLastInstruction(ArrayRef Blocks, // and move it to the start of the successor block. 
for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) I0->getOperandUse(O).set(NewOperands[O]); - I0->moveBefore(&*BBEnd->getFirstInsertionPt()); + I0->moveBefore(BBEnd->getFirstInsertionPt()); // Update metadata and IR flags. for (auto *I : Insts) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 658187ed74505..1a65154ae5936 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1448,9 +1448,9 @@ static Instruction *cloneInstructionInExitBlock( const ColorVector &CV = BlockColors.find(&ExitBlock)->second; assert(CV.size() == 1 && "non-unique color for exit block!"); BasicBlock *BBColor = CV.front(); - Instruction *EHPad = BBColor->getFirstNonPHI(); + BasicBlock::iterator EHPad = BBColor->getFirstNonPHIIt(); if (EHPad->isEHPad()) - OpBundles.emplace_back("funclet", EHPad); + OpBundles.emplace_back("funclet", &*EHPad); } New = CallInst::Create(CI, OpBundles); @@ -1549,7 +1549,8 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) { // it require updating BlockColors for all offspring blocks accordingly. By // skipping such corner case, we can make updating BlockColors after splitting // predecessor fairly simple. - if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad()) + if (!SafetyInfo->getBlockColors().empty() && + BB->getFirstNonPHIIt()->isEHPad()) return false; for (BasicBlock *BBPred : predecessors(BB)) { if (isa(BBPred->getTerminator())) diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 3c82eeda54838..c5091e731444e 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1641,8 +1641,8 @@ static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL, // plus "cnt0". Currently it is not optimized. // This step could be used to detect POPCNT instruction: // cnt.next = cnt + (x.next & 1) - for (Instruction &Inst : llvm::make_range( - LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + for (Instruction &Inst : + llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) { if (Inst.getOpcode() != Instruction::Add) continue; @@ -1745,8 +1745,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { CountInst = nullptr; - for (Instruction &Inst : llvm::make_range( - LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + for (Instruction &Inst : + llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) { if (Inst.getOpcode() != Instruction::Add) continue; @@ -1869,8 +1869,8 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL, // plus "cnt0". Currently it is not optimized. 
// This step could be used to detect POPCNT instruction: // cnt.next = cnt + (x.next & 1) - for (Instruction &Inst : llvm::make_range( - LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) { + for (Instruction &Inst : + llvm::make_range(LoopEntry->getFirstNonPHIIt(), LoopEntry->end())) { if (Inst.getOpcode() != Instruction::Add) continue; diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index ed80040aa4236..38fc682698c53 100644 --- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -1350,7 +1350,7 @@ bool LoopInterchangeTransform::transform() { // Duplicate instruction and move it the new latch. Update uses that // have been moved. Instruction *NewI = WorkList[i]->clone(); - NewI->insertBefore(NewLatch->getFirstNonPHI()); + NewI->insertBefore(NewLatch->getFirstNonPHIIt()); assert(!NewI->mayHaveSideEffects() && "Moving instructions with side-effects may change behavior of " "the loop nest!"); @@ -1388,8 +1388,9 @@ bool LoopInterchangeTransform::transform() { // Ensure the inner loop phi nodes have a separate basic block. BasicBlock *InnerLoopHeader = InnerLoop->getHeader(); - if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) { - SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI); + if (&*InnerLoopHeader->getFirstNonPHIIt() != + InnerLoopHeader->getTerminator()) { + SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHIIt(), DT, LI); LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n"); } @@ -1526,12 +1527,12 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, // InnerLatch, which will become the new exit block for the innermost // loop after interchanging. for (PHINode *P : LcssaInnerExit) - P->moveBefore(InnerLatch->getFirstNonPHI()); + P->moveBefore(InnerLatch->getFirstNonPHIIt()); // If the inner loop latch contains LCSSA PHIs, those come from a child loop // and we have to move them to the new inner latch. for (PHINode *P : LcssaInnerLatch) - P->moveBefore(InnerExit->getFirstNonPHI()); + P->moveBefore(InnerExit->getFirstNonPHIIt()); // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have // incoming values defined in the outer loop, we have to add a new PHI @@ -1557,7 +1558,7 @@ static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, continue; NewPhi->addIncoming(P.getIncomingValue(0), Pred); } - NewPhi->insertBefore(InnerLatch->getFirstNonPHI()); + NewPhi->insertBefore(InnerLatch->getFirstNonPHIIt()); P.setIncomingValue(0, NewPhi); } } @@ -1697,12 +1698,12 @@ bool LoopInterchangeTransform::adjustLoopBranches() { // outer loop and all the remains to do is and updating the incoming blocks. 
for (PHINode *PHI : OuterLoopPHIs) { LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump();); - PHI->moveBefore(InnerLoopHeader->getFirstNonPHI()); + PHI->moveBefore(InnerLoopHeader->getFirstNonPHIIt()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } for (PHINode *PHI : InnerLoopPHIs) { LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump();); - PHI->moveBefore(OuterLoopHeader->getFirstNonPHI()); + PHI->moveBefore(OuterLoopHeader->getFirstNonPHIIt()); assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node"); } diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index ae9103d0608a1..765b76e54068c 100644 --- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -365,7 +365,7 @@ class ConstantTerminatorFoldingImpl { for (auto &PN : BB->phis()) DeadInstructions.push_back(&PN); - if (auto *LandingPad = dyn_cast(BB->getFirstNonPHI())) + if (auto *LandingPad = dyn_cast(BB->getFirstNonPHIIt())) DeadInstructions.emplace_back(LandingPad); for (Instruction *I : DeadInstructions) { diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 7ec1949c1c10f..8be2f78187a0c 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -102,7 +102,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, // Remove unreachable blocks and non-feasible edges. for (BasicBlock *DeadBB : BlocksToErase) - NumInstRemoved += changeToUnreachable(DeadBB->getFirstNonPHI(), + NumInstRemoved += changeToUnreachable(&*DeadBB->getFirstNonPHIIt(), /*PreserveLCSSA=*/false, &DTU); BasicBlock *NewUnreachableBB = nullptr; diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index 4606514cbc717..62b4b545f29bb 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -366,8 +366,8 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, continue; // Don't even think about ehpads/landingpads. - Instruction *FirstNonPHI = Target->getFirstNonPHI(); - if (FirstNonPHI->isEHPad() || Target->isLandingPad()) + auto FirstNonPHIIt = Target->getFirstNonPHIIt(); + if (FirstNonPHIIt->isEHPad() || Target->isLandingPad()) continue; // Remember edge probabilities if needed. @@ -380,7 +380,7 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F, BPI->eraseBlock(Target); } - BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split"); + BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHIIt, ".split"); if (ShouldUpdateAnalysis) { // Copy the BFI/BPI from Target to BodyBlock. 
BPI->setEdgeProbability(BodyBlock, EdgeProbabilities); diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 33b3e4aea12d3..526132f5e5332 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -410,8 +410,8 @@ CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) { assert(!getFirstPHI(CommonExitBlock) && "Phi not expected"); #endif - BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock( - CommonExitBlock->getFirstNonPHI()->getIterator()); + BasicBlock *NewExitBlock = + CommonExitBlock->splitBasicBlock(CommonExitBlock->getFirstNonPHIIt()); for (BasicBlock *Pred : llvm::make_early_inc_range(predecessors(CommonExitBlock))) { @@ -701,7 +701,7 @@ void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) { // containing PHI nodes merging values from outside of the region, and a // second that contains all of the code for the block and merges back any // incoming values from inside of the region. - BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHI(), DT); + BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHIIt(), DT); // We only want to code extract the second block now, and it becomes the new // header of the region. diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index aa5e04d71657a..6a8a468ebcae2 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -276,7 +276,7 @@ static Value *getUnwindDestTokenHelper(Instruction *EHPad, Value *UnwindDestToken = nullptr; if (auto *CatchSwitch = dyn_cast(CurrentPad)) { if (CatchSwitch->hasUnwindDest()) { - UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI(); + UnwindDestToken = &*CatchSwitch->getUnwindDest()->getFirstNonPHIIt(); } else { // Catchswitch doesn't have a 'nounwind' variant, and one might be // annotated as "unwinds to caller" when really it's nounwind (see @@ -288,7 +288,8 @@ static Value *getUnwindDestTokenHelper(Instruction *EHPad, HE = CatchSwitch->handler_end(); HI != HE && !UnwindDestToken; ++HI) { BasicBlock *HandlerBlock = *HI; - auto *CatchPad = cast(HandlerBlock->getFirstNonPHI()); + auto *CatchPad = + cast(&*HandlerBlock->getFirstNonPHIIt()); for (User *Child : CatchPad->users()) { // Intentionally ignore invokes here -- since the catchswitch is // marked "unwind to caller", it would be a verifier error if it @@ -326,14 +327,14 @@ static Value *getUnwindDestTokenHelper(Instruction *EHPad, for (User *U : CleanupPad->users()) { if (auto *CleanupRet = dyn_cast(U)) { if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest()) - UnwindDestToken = RetUnwindDest->getFirstNonPHI(); + UnwindDestToken = &*RetUnwindDest->getFirstNonPHIIt(); else UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext()); break; } Value *ChildUnwindDestToken; if (auto *Invoke = dyn_cast(U)) { - ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI(); + ChildUnwindDestToken = &*Invoke->getUnwindDest()->getFirstNonPHIIt(); } else if (isa(U) || isa(U)) { Instruction *ChildPad = cast(U); auto Memo = MemoMap.find(ChildPad); @@ -522,14 +523,13 @@ static Value *getUnwindDestToken(Instruction *EHPad, if (auto *CatchSwitch = dyn_cast(UselessPad)) { assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad"); for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) { - auto *CatchPad = HandlerBlock->getFirstNonPHI(); + auto *CatchPad = 
&*HandlerBlock->getFirstNonPHIIt(); for (User *U : CatchPad->users()) { - assert( - (!isa(U) || - (getParentPad( - cast(U)->getUnwindDest()->getFirstNonPHI()) == - CatchPad)) && - "Expected useless pad"); + assert((!isa(U) || + (getParentPad(&*cast(U) + ->getUnwindDest() + ->getFirstNonPHIIt()) == CatchPad)) && + "Expected useless pad"); if (isa(U) || isa(U)) Worklist.push_back(cast(U)); } @@ -538,11 +538,12 @@ static Value *getUnwindDestToken(Instruction *EHPad, assert(isa(UselessPad)); for (User *U : UselessPad->users()) { assert(!isa(U) && "Expected useless pad"); - assert((!isa(U) || - (getParentPad( - cast(U)->getUnwindDest()->getFirstNonPHI()) == - UselessPad)) && - "Expected useless pad"); + assert( + (!isa(U) || + (getParentPad( + &*cast(U)->getUnwindDest()->getFirstNonPHIIt()) == + UselessPad)) && + "Expected useless pad"); if (isa(U) || isa(U)) Worklist.push_back(cast(U)); } @@ -678,7 +679,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, BasicBlock *UnwindDest = II->getUnwindDest(); Function *Caller = FirstNewBlock->getParent(); - assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!"); + assert(UnwindDest->getFirstNonPHIIt()->isEHPad() && "unexpected BasicBlock!"); // If there are PHI nodes in the unwind destination block, we need to keep // track of which values came into them from the invoke before removing the @@ -723,7 +724,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, } } - Instruction *I = BB->getFirstNonPHI(); + BasicBlock::iterator I = BB->getFirstNonPHIIt(); if (!I->isEHPad()) continue; @@ -772,7 +773,7 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock, } if (Replacement) { - Replacement->takeName(I); + Replacement->takeName(&*I); I->replaceAllUsesWith(Replacement); I->eraseFromParent(); UpdatePHINodes(&*BB); @@ -2288,7 +2289,7 @@ remapIndices(Function &Caller, BasicBlock *StartBB, // this may be the entryblock from the inlined callee, coming into a BB // that didn't have instrumentation because of MST decisions. Let's make // sure it's placed accordingly. This is a noop elsewhere. - BBID->moveBefore(&*BB->getFirstInsertionPt()); + BBID->moveBefore(BB->getFirstInsertionPt()); } for (auto &I : llvm::make_early_inc_range(*BB)) { if (auto *Inc = dyn_cast(&I)) { @@ -2581,7 +2582,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Ok, the call site is within a cleanuppad. Let's check the callee // for catchpads. for (const BasicBlock &CalledBB : *CalledFunc) { - if (isa(CalledBB.getFirstNonPHI())) + if (isa(CalledBB.getFirstNonPHIIt())) return InlineResult::failure("catch in cleanup funclet"); } } @@ -3029,7 +3030,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // rewriting the "parent pad" links. 
if (auto *II = dyn_cast(&CB)) { BasicBlock *UnwindDest = II->getUnwindDest(); - Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator FirstNonPHI = UnwindDest->getFirstNonPHIIt(); if (isa(FirstNonPHI)) { HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo); } else { @@ -3055,7 +3056,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally) changeToUnreachable(CleanupRet); - Instruction *I = BB->getFirstNonPHI(); + BasicBlock::iterator I = BB->getFirstNonPHIIt(); if (!I->isEHPad()) continue; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 94cf1185bc2cb..d5cf62e52cca3 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -2108,7 +2108,7 @@ insertDbgVariableRecordsForPHIs(BasicBlock *BB, for (auto PHI : InsertedPHIs) { BasicBlock *Parent = PHI->getParent(); // Avoid inserting a debug-info record into an EH block. - if (Parent->getFirstNonPHI()->isEHPad()) + if (Parent->getFirstNonPHIIt()->isEHPad()) continue; for (auto VI : PHI->operand_values()) { auto V = DbgValueMap.find(VI); @@ -2174,7 +2174,7 @@ void llvm::insertDebugValuesForPHIs(BasicBlock *BB, for (auto *PHI : InsertedPHIs) { BasicBlock *Parent = PHI->getParent(); // Avoid inserting an intrinsic into an EH block. - if (Parent->getFirstNonPHI()->isEHPad()) + if (Parent->getFirstNonPHIIt()->isEHPad()) continue; for (auto *VI : PHI->operand_values()) { auto V = DbgValueMap.find(VI); @@ -3206,7 +3206,7 @@ static bool markAliveBlocks(Function &F, BasicBlock *HandlerBB = *I; if (DTU) ++NumPerSuccessorCases[HandlerBB]; - auto *CatchPad = cast(HandlerBB->getFirstNonPHI()); + auto *CatchPad = cast(HandlerBB->getFirstNonPHIIt()); if (!HandlerSet.insert({CatchPad, Empty}).second) { if (DTU) --NumPerSuccessorCases[HandlerBB]; diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index b3f9f76274d30..61ffb49a8c010 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -382,7 +382,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader, BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(), Header->getName() + ".backedge", F); BranchInst *BETerminator = BranchInst::Create(Header, BEBlock); - BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc()); + BETerminator->setDebugLoc(Header->getFirstNonPHIIt()->getDebugLoc()); LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block " << BEBlock->getName() << "\n"); diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 91291b429ea43..dbab56a6996ce 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -109,8 +109,9 @@ void llvm::createMemCpyLoopKnownSize( uint64_t BytesCopied = LoopEndCount; uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied; if (RemainingBytes) { - IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI() - : InsertBefore); + BasicBlock::iterator InsertIt = PostLoopBB ? 
PostLoopBB->getFirstNonPHIIt() + : InsertBefore->getIterator(); + IRBuilder<> RBuilder(InsertIt->getParent(), InsertIt); SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, @@ -735,14 +736,16 @@ static void createMemMoveLoopKnownSize(Instruction *InsertBefore, // the same way, except that we change the IRBuilder insert point for each // load/store pair so that each one is inserted before the previous one // instead of after it. - IRBuilder<> BwdResBuilder(CopyBackwardsBB->getFirstNonPHI()); + IRBuilder<> BwdResBuilder(CopyBackwardsBB, + CopyBackwardsBB->getFirstNonPHIIt()); SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, SrcAS, DstAS, PartSrcAlign, PartDstAlign); for (auto *OpTy : RemainingOps) { // reverse the order of the emitted operations - BwdResBuilder.SetInsertPoint(CopyBackwardsBB->getFirstNonPHI()); + BwdResBuilder.SetInsertPoint(CopyBackwardsBB, + CopyBackwardsBB->getFirstNonPHIIt()); GenerateResidualLdStPair(OpTy, BwdResBuilder, BytesCopied); } } diff --git a/llvm/lib/Transforms/Utils/MoveAutoInit.cpp b/llvm/lib/Transforms/Utils/MoveAutoInit.cpp index 9b1b09bb3d8f2..ad105f5a57b49 100644 --- a/llvm/lib/Transforms/Utils/MoveAutoInit.cpp +++ b/llvm/lib/Transforms/Utils/MoveAutoInit.cpp @@ -179,7 +179,7 @@ static bool runMoveAutoInit(Function &F, DominatorTree &DT, MemorySSA &MSSA) { // CatchSwitchInst blocks can only have one instruction, so they are not // good candidates for insertion. - while (isa(UsersDominator->getFirstNonPHI())) { + while (isa(UsersDominator->getFirstNonPHIIt())) { for (BasicBlock *Pred : predecessors(UsersDominator)) if (DT.isReachableFromEntry(Pred)) UsersDominator = DT.findNearestCommonDominator(UsersDominator, Pred); diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 229b1d9f07f8c..48d9528f0c3df 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -173,8 +173,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // Set the DebugLoc of the inserted PHI, if available. DebugLoc DL; - if (const Instruction *I = BB->getFirstNonPHI()) - DL = I->getDebugLoc(); + if (BasicBlock::iterator It = BB->getFirstNonPHIIt(); It != BB->end()) + DL = It->getDebugLoc(); InsertedPHI->setDebugLoc(DL); // If the client wants to know about all new instructions, tell it. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e4e87704c1c97..49694eb68e25b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7959,7 +7959,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( PhisInBlock.push_back(&Phi); for (PHINode *Phi : PhisInBlock) { - Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI()); + Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHIIt()); Phi->replaceIncomingBlockWith( VecEpilogueIterationCountCheck->getSinglePredecessor(), VecEpilogueIterationCountCheck); @@ -10291,8 +10291,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, // VPReductionPHIRecipes for AnyOf reductions expect a boolean as // start value; compare the final value from the main vector loop // to the start value. 
- IRBuilder<> Builder( - cast(ResumeV)->getParent()->getFirstNonPHI()); + BasicBlock *PBB = cast(ResumeV)->getParent(); + IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt()); ResumeV = Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue()); } else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 9d973d200662d..4159a71469bd1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -316,10 +316,9 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { // last PHI, if LastInst is a PHI. This ensures the insertelement sequence // will directly follow the scalar definitions. auto OldIP = Builder.saveIP(); - auto NewIP = - isa(LastInst) - ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI()) - : std::next(BasicBlock::iterator(LastInst)); + auto NewIP = isa(LastInst) + ? LastInst->getParent()->getFirstNonPHIIt() + : std::next(BasicBlock::iterator(LastInst)); Builder.SetInsertPoint(&*NewIP); // However, if we are vectorizing, we need to construct the vector values. diff --git a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp index 41e3ffd963f5b..da363df77d0c0 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceBasicBlocks.cpp @@ -52,7 +52,7 @@ static void replaceBranchTerminator(BasicBlock &BB, bool IsBranch = isa(Term); if (InvokeInst *Invoke = dyn_cast(Term)) { BasicBlock *UnwindDest = Invoke->getUnwindDest(); - Instruction *LP = UnwindDest->getFirstNonPHI(); + BasicBlock::iterator LP = UnwindDest->getFirstNonPHIIt(); // Remove landingpad instruction if the containing block isn't used by other // invokes. 
diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp index 1fb3f46b9240f..ad4393ccd5315 100644 --- a/llvm/unittests/Analysis/MemorySSATest.cpp +++ b/llvm/unittests/Analysis/MemorySSATest.cpp @@ -1578,7 +1578,7 @@ TEST_F(MemorySSATest, TestLoopInvariantEntryBlockPointer) { for (auto &BB : *F) { if (BB.getName() == "exit") { // Get the store instruction - auto *SI = BB.getFirstNonPHI(); + auto *SI = &*BB.getFirstNonPHIIt(); // Get the memory access and location MemoryAccess *MA = MSSA.getMemoryAccess(SI); MemoryLocation ML = MemoryLocation::get(SI); diff --git a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp index f36d3ba99775b..519389d8e0b19 100644 --- a/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp +++ b/llvm/unittests/Analysis/ProfileSummaryInfoTest.cpp @@ -146,7 +146,7 @@ TEST_F(ProfileSummaryInfoTest, TestNoProfile) { EXPECT_FALSE(PSI.isHotBlock(&BB0, &BFI)); EXPECT_FALSE(PSI.isColdBlock(&BB0, &BFI)); - CallBase &CS1 = cast(*BB1->getFirstNonPHI()); + CallBase &CS1 = cast(*BB1->getFirstNonPHIIt()); EXPECT_FALSE(PSI.isHotCallSite(CS1, &BFI)); EXPECT_FALSE(PSI.isColdCallSite(CS1, &BFI)); } @@ -240,8 +240,8 @@ TEST_F(ProfileSummaryInfoTest, InstrProf) { EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB2, &BFI)); EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB3, &BFI)); - CallBase &CS1 = cast(*BB1->getFirstNonPHI()); - auto *CI2 = BB2->getFirstNonPHI(); + CallBase &CS1 = cast(*BB1->getFirstNonPHIIt()); + BasicBlock::iterator CI2 = BB2->getFirstNonPHIIt(); CallBase &CS2 = cast(*CI2); EXPECT_TRUE(PSI.isHotCallSite(CS1, &BFI)); @@ -336,8 +336,8 @@ TEST_F(ProfileSummaryInfoTest, SampleProf) { EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB2, &BFI)); EXPECT_TRUE(PSI.isColdBlockNthPercentile(10000, BB3, &BFI)); - CallBase &CS1 = cast(*BB1->getFirstNonPHI()); - auto *CI2 = BB2->getFirstNonPHI(); + CallBase &CS1 = cast(*BB1->getFirstNonPHIIt()); + BasicBlock::iterator CI2 = BB2->getFirstNonPHIIt(); // Manually attach branch weights metadata to the call instruction. 
SmallVector Weights; Weights.push_back(1000); diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index f620d2c968b3f..42616155f0cc3 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -6434,7 +6434,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { // Check entry block auto &EntryBlock = OutlinedFn->getEntryBlock(); - Instruction *Alloca1 = EntryBlock.getFirstNonPHI(); + Instruction *Alloca1 = &*EntryBlock.getFirstNonPHIIt(); EXPECT_NE(Alloca1, nullptr); EXPECT_TRUE(isa(Alloca1)); @@ -6469,7 +6469,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { // Check user code block auto *UserCodeBlock = EntryBlockBranch->getSuccessor(0); EXPECT_EQ(UserCodeBlock->getName(), "user_code.entry"); - auto *Load1 = UserCodeBlock->getFirstNonPHI(); + Instruction *Load1 = &*UserCodeBlock->getFirstNonPHIIt(); EXPECT_TRUE(isa(Load1)); auto *Load2 = Load1->getNextNode(); EXPECT_TRUE(isa(Load2)); @@ -6480,7 +6480,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { auto *OutlinedBlock = OutlinedBlockBr->getSuccessor(0); EXPECT_EQ(OutlinedBlock->getName(), "outlined.body"); - auto *Value1 = OutlinedBlock->getFirstNonPHI(); + Instruction *Value1 = &*OutlinedBlock->getFirstNonPHIIt(); EXPECT_EQ(Value1, Value); EXPECT_EQ(Value1->getNextNode(), TargetStore); auto *Deinit = TargetStore->getNextNode(); @@ -6496,7 +6496,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { // Check exit block auto *ExitBlock = EntryBlockBranch->getSuccessor(1); EXPECT_EQ(ExitBlock->getName(), "worker.exit"); - EXPECT_TRUE(isa(ExitBlock->getFirstNonPHI())); + EXPECT_TRUE(isa(ExitBlock->getFirstNonPHIIt())); // Check global exec_mode. GlobalVariable *Used = M->getGlobalVariable("llvm.compiler.used"); @@ -6804,7 +6804,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { // Check that we have moved our alloca created in the // BodyGenCB function, to the top of the function. 
- Instruction *Alloca1 = EntryBlock.getFirstNonPHI(); + Instruction *Alloca1 = &*EntryBlock.getFirstNonPHIIt(); EXPECT_NE(Alloca1, nullptr); EXPECT_TRUE(isa(Alloca1)); EXPECT_EQ(Alloca1, RaiseAlloca); @@ -6840,7 +6840,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { // Check user code block auto *UserCodeBlock = EntryBlockBranch->getSuccessor(0); EXPECT_EQ(UserCodeBlock->getName(), "user_code.entry"); - auto *Load1 = UserCodeBlock->getFirstNonPHI(); + BasicBlock::iterator Load1 = UserCodeBlock->getFirstNonPHIIt(); EXPECT_TRUE(isa(Load1)); auto *OutlinedBlockBr = Load1->getNextNode(); @@ -6849,7 +6849,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { auto *OutlinedBlock = OutlinedBlockBr->getSuccessor(0); EXPECT_EQ(OutlinedBlock->getName(), "outlined.body"); - auto *Load2 = OutlinedBlock->getFirstNonPHI(); + Instruction *Load2 = &*OutlinedBlock->getFirstNonPHIIt(); EXPECT_TRUE(isa(Load2)); EXPECT_EQ(Load2, Value); EXPECT_EQ(Load2->getNextNode(), TargetStore); @@ -6866,7 +6866,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { // Check exit block auto *ExitBlock = EntryBlockBranch->getSuccessor(1); EXPECT_EQ(ExitBlock->getName(), "worker.exit"); - EXPECT_TRUE(isa(ExitBlock->getFirstNonPHI())); + EXPECT_TRUE(isa(ExitBlock->getFirstNonPHIIt())); } TEST_F(OpenMPIRBuilderTest, CreateTask) { diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index ea20c87d6b09b..4283ba7a8f823 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -134,7 +134,7 @@ TEST(StripTest, LoopMetadata) { // we update the terminator's metadata correctly, we should be able to // observe the change in emission kind for the CU. auto getEmissionKind = [&]() { - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); MDNode *LoopMD = I.getMetadata(LLVMContext::MD_loop); return cast(LoopMD->getOperand(1)) ->getScope() @@ -183,7 +183,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgRecord) { )"); // Find %b = add ... - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); // Find the dbg.value using %b. SmallVector DVIs; @@ -268,7 +268,7 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) { !11 = !DILocation(line: 1, column: 1, scope: !6) )"); - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); // Find the DbgVariableRecords using %b. 
SmallVector DVIs; @@ -319,7 +319,7 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) { !12 = !DILocalVariable(name: "bar", scope: !6, file: !1, line: 1, type: !10) )"); - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); SmallVector DVIs; SmallVector DVRs; @@ -902,7 +902,7 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) { )"); // Find the first dbg.value, - Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI(); + Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHIIt(); const DILocalVariable *Var = nullptr; const DIExpression *Expr = nullptr; const DILocation *Loc = nullptr; diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp index c1d3279688858..386a60702d0da 100644 --- a/llvm/unittests/IR/InstructionsTest.cpp +++ b/llvm/unittests/IR/InstructionsTest.cpp @@ -1716,7 +1716,7 @@ TEST(InstructionsTest, DropLocation) { cast(M->getNamedValue("no_parent_scope")); BasicBlock &BB = NoParentScopeF->front(); - auto *I1 = BB.getFirstNonPHI(); + auto *I1 = &*BB.getFirstNonPHIIt(); auto *I2 = I1->getNextNode(); auto *I3 = BB.getTerminator(); @@ -1738,7 +1738,7 @@ TEST(InstructionsTest, DropLocation) { cast(M->getNamedValue("with_parent_scope")); BasicBlock &BB = WithParentScopeF->front(); - auto *I2 = BB.getFirstNonPHI()->getNextNode(); + auto *I2 = BB.getFirstNonPHIIt()->getNextNode(); MDNode *Scope = cast(WithParentScopeF->getSubprogram()); EXPECT_EQ(I2->getDebugLoc().getLine(), 2U); diff --git a/llvm/unittests/Transforms/Scalar/LICMTest.cpp b/llvm/unittests/Transforms/Scalar/LICMTest.cpp index 5a986b067700c..98a69bbb47de1 100644 --- a/llvm/unittests/Transforms/Scalar/LICMTest.cpp +++ b/llvm/unittests/Transforms/Scalar/LICMTest.cpp @@ -63,7 +63,7 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) { BasicBlock *LoopBB = EntryBB.getUniqueSuccessor(); // Select `load i64, i64* %ptr`. - Instruction *IBefore = LoopBB->getFirstNonPHI(); + Instruction *IBefore = &*LoopBB->getFirstNonPHIIt(); // Make sure the right instruction was selected. ASSERT_TRUE(isa(IBefore)); // Upon this query SCEV caches disposition of SCEV. @@ -73,7 +73,7 @@ TEST(LICMTest, TestSCEVInvalidationOnHoisting) { MPM.run(*M, MAM); // Select `load i64, i64* %ptr` after it was hoisted. - Instruction *IAfter = EntryBB.getFirstNonPHI(); + Instruction *IAfter = &*EntryBB.getFirstNonPHIIt(); // Make sure the right instruction was selected. ASSERT_TRUE(isa(IAfter)); diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp index 1293e4c921c9d..b2e3b5d32fbe2 100644 --- a/polly/lib/CodeGen/BlockGenerators.cpp +++ b/polly/lib/CodeGen/BlockGenerators.cpp @@ -508,7 +508,7 @@ Value *BlockGenerator::getOrCreateAlloca(const ScopArrayInfo *Array) { new AllocaInst(Ty, DL.getAllocaAddrSpace(), nullptr, DL.getPrefTypeAlign(Ty), ScalarBase->getName() + NameExt); BasicBlock *EntryBB = &Builder.GetInsertBlock()->getParent()->getEntryBlock(); - Addr->insertBefore(&*EntryBB->getFirstInsertionPt()); + Addr->insertBefore(EntryBB->getFirstInsertionPt()); return Addr; } @@ -869,7 +869,7 @@ void BlockGenerator::createScalarFinalization(Scop &S) { // Create the merge PHI that merges the optimized and unoptimized version. 
PHINode *MergePHI = PHINode::Create(EscapeInst->getType(), 2, EscapeInst->getName() + ".merge"); - MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt()); + MergePHI->insertBefore(MergeBB->getFirstInsertionPt()); // Add the respective values to the merge PHI. MergePHI->addIncoming(EscapeInstReload, OptExitBB); @@ -950,7 +950,7 @@ void BlockGenerator::createExitPHINodeMerges(Scop &S) { cast(OriginalValue)->getParent() != MergeBB) && "Original value must no be one we just generated."); auto *MergePHI = PHINode::Create(PHI->getType(), 2, Name + ".ph.merge"); - MergePHI->insertBefore(&*MergeBB->getFirstInsertionPt()); + MergePHI->insertBefore(MergeBB->getFirstInsertionPt()); MergePHI->addIncoming(Reload, OptExitBB); MergePHI->addIncoming(OriginalValue, ExitBB); int Idx = PHI->getBasicBlockIndex(MergeBB); @@ -1384,7 +1384,7 @@ void RegionGenerator::copyPHIInstruction(ScopStmt &Stmt, PHINode *PHI, unsigned NumIncoming = PHI->getNumIncomingValues(); PHINode *PHICopy = Builder.CreatePHI(PHI->getType(), NumIncoming, "polly." + PHI->getName()); - PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHI()); + PHICopy->moveBefore(PHICopy->getParent()->getFirstNonPHIIt()); BBMap[PHI] = PHICopy; for (BasicBlock *IncomingBB : PHI->blocks()) diff --git a/polly/lib/CodeGen/LoopGenerators.cpp b/polly/lib/CodeGen/LoopGenerators.cpp index 5f772170d9628..f3975ccee44fa 100644 --- a/polly/lib/CodeGen/LoopGenerators.cpp +++ b/polly/lib/CodeGen/LoopGenerators.cpp @@ -185,7 +185,7 @@ Value *polly::createLoop(Value *LB, Value *UB, Value *Stride, DT.changeImmediateDominator(ExitBB, HeaderBB); // The loop body should be added here. - Builder.SetInsertPoint(HeaderBB->getFirstNonPHI()); + Builder.SetInsertPoint(HeaderBB->getFirstNonPHIIt()); return IV; } diff --git a/polly/lib/Transform/MaximalStaticExpansion.cpp b/polly/lib/Transform/MaximalStaticExpansion.cpp index e32a69d47f69c..c9227ac0bfd10 100644 --- a/polly/lib/Transform/MaximalStaticExpansion.cpp +++ b/polly/lib/Transform/MaximalStaticExpansion.cpp @@ -169,7 +169,7 @@ class MaximalStaticExpansionImpl { } else if (SAI->isExitPHIKind()) { // For now, we are not able to expand ExitPhi. emitRemark(SAI->getName() + " is a ExitPhi node.", - S.getEnteringBlock()->getFirstNonPHI()); + &*S.getEnteringBlock()->getFirstNonPHIIt()); return false; } @@ -270,7 +270,7 @@ class MaximalStaticExpansionImpl { // No need to expand SAI with no write. if (NumberWrites == 0) { emitRemark(SAI->getName() + " has 0 write access.", - S.getEnteringBlock()->getFirstNonPHI()); + &*S.getEnteringBlock()->getFirstNonPHIIt()); return false; } From 02c6002d1cd2dabe4b98368f91e7b4395e5ab11d Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 24 Jan 2025 13:42:06 +0000 Subject: [PATCH 003/432] [lldb][AArch64] Add Guarded Control Stack registers (#123720) The Guarded Control Stack extension implements a shadow stack and the Linux kernel provides access to 3 registers for it via ptrace. struct user_gcs { __u64 features_enabled; __u64 features_locked; __u64 gcspr_el0; }; This commit adds support for reading those from a live process. The first 2 are pseudo registers based on the real control register and the 3rd is a real register. This is the stack pointer for the guarded stack. I have added a "gcs_" prefix to the "features" registers so that they have a clear name when shown individually. Also this means they will tab complete from "gcs", and be next to gcspr_el0 in any sorted lists of registers. 
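For reference, here is a minimal sketch of how a tracer can fetch this regset
from a stopped thread. This is illustrative only, not LLDB code: it assumes a
kernel that implements NT_ARM_GCS (value as defined in the diff below), and
read_gcs is an invented helper name.

  #include <cstdint>
  #include <sys/ptrace.h>
  #include <sys/types.h>
  #include <sys/uio.h>

  #ifndef NT_ARM_GCS
  #define NT_ARM_GCS 0x410 // Guarded Control Stack control registers
  #endif

  struct user_gcs {
    uint64_t features_enabled;
    uint64_t features_locked;
    uint64_t gcspr_el0;
  };

  // Read all three GCS values in one PTRACE_GETREGSET call; the kernel
  // writes a struct user_gcs into the supplied iovec.
  bool read_gcs(pid_t tid, user_gcs &regs) {
    struct iovec iov = {&regs, sizeof(regs)};
    return ptrace(PTRACE_GETREGSET, tid, (void *)NT_ARM_GCS, &iov) == 0;
  }

In LLDB, the new register set then displays as: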
Guarded Control Stack Registers: gcs_features_enabled = 0x0000000000000000 gcs_features_locked = 0x0000000000000000 gcspr_el0 = 0x0000000000000000 Testing is more of the usual, where possible I'm writing a register then doing something in the program to confirm the value was actually sent to ptrace. --- .../NativeRegisterContextLinux_arm64.cpp | 86 ++++++++++ .../Linux/NativeRegisterContextLinux_arm64.h | 16 ++ .../Utility/RegisterContextPOSIX_arm64.cpp | 4 + .../Utility/RegisterContextPOSIX_arm64.h | 1 + .../Utility/RegisterInfoPOSIX_arm64.cpp | 39 ++++- .../Process/Utility/RegisterInfoPOSIX_arm64.h | 7 + .../linux/aarch64/gcs/TestAArch64LinuxGCS.py | 152 ++++++++++++++++++ lldb/test/API/linux/aarch64/gcs/main.c | 23 ++- 8 files changed, 323 insertions(+), 5 deletions(-) diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp index 6056f3001fed6..efd3385c46e92 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp @@ -64,8 +64,14 @@ #define NT_ARM_FPMR 0x40e /* Floating point mode register */ #endif +#ifndef NT_ARM_GCS +#define NT_ARM_GCS 0x410 /* Guarded Control Stack control registers */ +#endif + #define HWCAP_PACA (1 << 30) +#define HWCAP_GCS (1UL << 32) + #define HWCAP2_MTE (1 << 18) #define HWCAP2_FPMR (1UL << 48) @@ -150,6 +156,8 @@ NativeRegisterContextLinux::CreateHostNativeRegisterContextLinux( opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskMTE); if (*auxv_at_hwcap2 & HWCAP2_FPMR) opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskFPMR); + if (*auxv_at_hwcap & HWCAP_GCS) + opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskGCS); } opt_regsets.Set(RegisterInfoPOSIX_arm64::eRegsetMaskTLS); @@ -193,6 +201,7 @@ NativeRegisterContextLinux_arm64::NativeRegisterContextLinux_arm64( ::memset(&m_pac_mask, 0, sizeof(m_pac_mask)); ::memset(&m_tls_regs, 0, sizeof(m_tls_regs)); ::memset(&m_sme_pseudo_regs, 0, sizeof(m_sme_pseudo_regs)); + ::memset(&m_gcs_regs, 0, sizeof(m_gcs_regs)); std::fill(m_zt_reg.begin(), m_zt_reg.end(), 0); m_mte_ctrl_reg = 0; @@ -213,6 +222,7 @@ NativeRegisterContextLinux_arm64::NativeRegisterContextLinux_arm64( m_tls_is_valid = false; m_zt_buffer_is_valid = false; m_fpmr_is_valid = false; + m_gcs_is_valid = false; // SME adds the tpidr2 register m_tls_size = GetRegisterInfo().IsSSVEPresent() ? 
sizeof(m_tls_regs) @@ -433,6 +443,14 @@ NativeRegisterContextLinux_arm64::ReadRegister(const RegisterInfo *reg_info, offset = reg_info->byte_offset - GetRegisterInfo().GetFPMROffset(); assert(offset < GetFPMRBufferSize()); src = (uint8_t *)GetFPMRBuffer() + offset; + } else if (IsGCS(reg)) { + error = ReadGCS(); + if (error.Fail()) + return error; + + offset = reg_info->byte_offset - GetRegisterInfo().GetGCSOffset(); + assert(offset < GetGCSBufferSize()); + src = (uint8_t *)GetGCSBuffer() + offset; } else return Status::FromErrorString( "failed - register wasn't recognized to be a GPR or an FPR, " @@ -657,6 +675,17 @@ Status NativeRegisterContextLinux_arm64::WriteRegister( ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); return WriteFPMR(); + } else if (IsGCS(reg)) { + error = ReadGCS(); + if (error.Fail()) + return error; + + offset = reg_info->byte_offset - GetRegisterInfo().GetGCSOffset(); + assert(offset < GetGCSBufferSize()); + dst = (uint8_t *)GetGCSBuffer() + offset; + ::memcpy(dst, reg_value.GetBytes(), reg_info->byte_size); + + return WriteGCS(); } return Status::FromErrorString("Failed to write register value"); @@ -672,6 +701,7 @@ enum RegisterSetType : uint32_t { SME, // ZA only, because SVCR and SVG are pseudo registers. SME2, // ZT only. FPMR, + GCS, // Guarded Control Stack registers. }; static uint8_t *AddRegisterSetType(uint8_t *dst, @@ -759,6 +789,13 @@ NativeRegisterContextLinux_arm64::CacheAllRegisters(uint32_t &cached_size) { return error; } + if (GetRegisterInfo().IsGCSPresent()) { + cached_size += sizeof(RegisterSetType) + GetGCSBufferSize(); + error = ReadGCS(); + if (error.Fail()) + return error; + } + // tpidr is always present but tpidr2 depends on SME. cached_size += sizeof(RegisterSetType) + GetTLSBufferSize(); error = ReadTLS(); @@ -867,6 +904,11 @@ Status NativeRegisterContextLinux_arm64::ReadAllRegisterValues( GetFPMRBufferSize()); } + if (GetRegisterInfo().IsGCSPresent()) { + dst = AddSavedRegisters(dst, RegisterSetType::GCS, GetGCSBuffer(), + GetGCSBufferSize()); + } + dst = AddSavedRegisters(dst, RegisterSetType::TLS, GetTLSBuffer(), GetTLSBufferSize()); @@ -1020,6 +1062,11 @@ Status NativeRegisterContextLinux_arm64::WriteAllRegisterValues( GetFPMRBuffer(), &src, GetFPMRBufferSize(), m_fpmr_is_valid, std::bind(&NativeRegisterContextLinux_arm64::WriteFPMR, this)); break; + case RegisterSetType::GCS: + error = RestoreRegisters( + GetGCSBuffer(), &src, GetGCSBufferSize(), m_gcs_is_valid, + std::bind(&NativeRegisterContextLinux_arm64::WriteGCS, this)); + break; } if (error.Fail()) @@ -1067,6 +1114,10 @@ bool NativeRegisterContextLinux_arm64::IsFPMR(unsigned reg) const { return GetRegisterInfo().IsFPMRReg(reg); } +bool NativeRegisterContextLinux_arm64::IsGCS(unsigned reg) const { + return GetRegisterInfo().IsGCSReg(reg); +} + llvm::Error NativeRegisterContextLinux_arm64::ReadHardwareDebugInfo() { if (!m_refresh_hwdebug_info) { return llvm::Error::success(); @@ -1215,6 +1266,7 @@ void NativeRegisterContextLinux_arm64::InvalidateAllRegisters() { m_tls_is_valid = false; m_zt_buffer_is_valid = false; m_fpmr_is_valid = false; + m_gcs_is_valid = false; // Update SVE and ZA registers in case there is change in configuration. 
ConfigureRegisterContext(); @@ -1400,6 +1452,40 @@ Status NativeRegisterContextLinux_arm64::WriteTLS() { return WriteRegisterSet(&ioVec, GetTLSBufferSize(), NT_ARM_TLS); } +Status NativeRegisterContextLinux_arm64::ReadGCS() { + Status error; + + if (m_gcs_is_valid) + return error; + + struct iovec ioVec; + ioVec.iov_base = GetGCSBuffer(); + ioVec.iov_len = GetGCSBufferSize(); + + error = ReadRegisterSet(&ioVec, GetGCSBufferSize(), NT_ARM_GCS); + + if (error.Success()) + m_gcs_is_valid = true; + + return error; +} + +Status NativeRegisterContextLinux_arm64::WriteGCS() { + Status error; + + error = ReadGCS(); + if (error.Fail()) + return error; + + struct iovec ioVec; + ioVec.iov_base = GetGCSBuffer(); + ioVec.iov_len = GetGCSBufferSize(); + + m_gcs_is_valid = false; + + return WriteRegisterSet(&ioVec, GetGCSBufferSize(), NT_ARM_GCS); +} + Status NativeRegisterContextLinux_arm64::ReadZAHeader() { Status error; diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h index 16190b5492582..7ed0da8503496 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.h @@ -92,6 +92,7 @@ class NativeRegisterContextLinux_arm64 bool m_pac_mask_is_valid; bool m_tls_is_valid; size_t m_tls_size; + bool m_gcs_is_valid; struct user_pt_regs m_gpr_arm64; // 64-bit general purpose registers. @@ -136,6 +137,12 @@ class NativeRegisterContextLinux_arm64 uint64_t m_fpmr_reg; + struct gcs_regs { + uint64_t features_enabled; + uint64_t features_locked; + uint64_t gcspr_e0; + } m_gcs_regs; + bool IsGPR(unsigned reg) const; bool IsFPR(unsigned reg) const; @@ -166,6 +173,10 @@ class NativeRegisterContextLinux_arm64 Status WriteZA(); + Status ReadGCS(); + + Status WriteGCS(); + // No WriteZAHeader because writing only the header will disable ZA. // Instead use WriteZA and ensure you have the correct ZA buffer size set // beforehand if you wish to disable it. 
@@ -187,6 +198,7 @@ class NativeRegisterContextLinux_arm64 bool IsMTE(unsigned reg) const; bool IsTLS(unsigned reg) const; bool IsFPMR(unsigned reg) const; + bool IsGCS(unsigned reg) const; uint64_t GetSVERegVG() { return m_sve_header.vl / 8; } @@ -212,6 +224,8 @@ class NativeRegisterContextLinux_arm64 void *GetFPMRBuffer() { return &m_fpmr_reg; } + void *GetGCSBuffer() { return &m_gcs_regs; } + size_t GetSVEHeaderSize() { return sizeof(m_sve_header); } size_t GetPACMaskSize() { return sizeof(m_pac_mask); } @@ -234,6 +248,8 @@ class NativeRegisterContextLinux_arm64 size_t GetFPMRBufferSize() { return sizeof(m_fpmr_reg); } + size_t GetGCSBufferSize() { return sizeof(m_gcs_regs); } + llvm::Error ReadHardwareDebugInfo() override; llvm::Error WriteHardwareDebugRegs(DREGType hwbType) override; diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp index 575e9c8c81cbf..0233837f99d09 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.cpp @@ -63,6 +63,10 @@ bool RegisterContextPOSIX_arm64::IsFPMR(unsigned reg) const { return m_register_info_up->IsFPMRReg(reg); } +bool RegisterContextPOSIX_arm64::IsGCS(unsigned reg) const { + return m_register_info_up->IsGCSReg(reg); +} + RegisterContextPOSIX_arm64::RegisterContextPOSIX_arm64( lldb_private::Thread &thread, std::unique_ptr register_info) diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h index 35ad56c98a7ae..de46c628d836d 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterContextPOSIX_arm64.h @@ -59,6 +59,7 @@ class RegisterContextPOSIX_arm64 : public lldb_private::RegisterContext { bool IsSME(unsigned reg) const; bool IsMTE(unsigned reg) const; bool IsFPMR(unsigned reg) const; + bool IsGCS(unsigned reg) const; bool IsSVEZ(unsigned reg) const { return m_register_info_up->IsSVEZReg(reg); } bool IsSVEP(unsigned reg) const { return m_register_info_up->IsSVEPReg(reg); } diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp index f51a93e1b2dcb..c004c0f3c3cf5 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.cpp @@ -97,6 +97,10 @@ static lldb_private::RegisterInfo g_register_infos_sme2[] = { static lldb_private::RegisterInfo g_register_infos_fpmr[] = { DEFINE_EXTENSION_REG(fpmr)}; +static lldb_private::RegisterInfo g_register_infos_gcs[] = { + DEFINE_EXTENSION_REG(gcs_features_enabled), + DEFINE_EXTENSION_REG(gcs_features_locked), DEFINE_EXTENSION_REG(gcspr_el0)}; + // Number of register sets provided by this context. enum { k_num_gpr_registers = gpr_w28 - gpr_x0 + 1, @@ -109,6 +113,7 @@ enum { // only for SME1 registers. 
k_num_sme_register = 3, k_num_fpmr_register = 1, + k_num_gcs_register = 3, k_num_register_sets_default = 2, k_num_register_sets = 3 }; @@ -221,6 +226,9 @@ static const lldb_private::RegisterSet g_reg_set_sme_arm64 = { static const lldb_private::RegisterSet g_reg_set_fpmr_arm64 = { "Floating Point Mode Register", "fpmr", k_num_fpmr_register, nullptr}; +static const lldb_private::RegisterSet g_reg_set_gcs_arm64 = { + "Guarded Control Stack Registers", "gcs", k_num_gcs_register, nullptr}; + RegisterInfoPOSIX_arm64::RegisterInfoPOSIX_arm64( const lldb_private::ArchSpec &target_arch, lldb_private::Flags opt_regsets) : lldb_private::RegisterInfoAndSetInterface(target_arch), @@ -273,6 +281,9 @@ RegisterInfoPOSIX_arm64::RegisterInfoPOSIX_arm64( if (m_opt_regsets.AllSet(eRegsetMaskFPMR)) AddRegSetFPMR(); + if (m_opt_regsets.AllSet(eRegsetMaskGCS)) + AddRegSetGCS(); + m_register_info_count = m_dynamic_reg_infos.size(); m_register_info_p = m_dynamic_reg_infos.data(); m_register_set_p = m_dynamic_reg_sets.data(); @@ -434,6 +445,24 @@ void RegisterInfoPOSIX_arm64::AddRegSetFPMR() { m_dynamic_reg_sets.back().registers = m_fpmr_regnum_collection.data(); } +void RegisterInfoPOSIX_arm64::AddRegSetGCS() { + uint32_t gcs_regnum = m_dynamic_reg_infos.size(); + for (uint32_t i = 0; i < k_num_gcs_register; i++) { + m_gcs_regnum_collection.push_back(gcs_regnum + i); + m_dynamic_reg_infos.push_back(g_register_infos_gcs[i]); + m_dynamic_reg_infos[gcs_regnum + i].byte_offset = + m_dynamic_reg_infos[gcs_regnum + i - 1].byte_offset + + m_dynamic_reg_infos[gcs_regnum + i - 1].byte_size; + m_dynamic_reg_infos[gcs_regnum + i].kinds[lldb::eRegisterKindLLDB] = + gcs_regnum + i; + } + + m_per_regset_regnum_range[m_register_set_count] = + std::make_pair(gcs_regnum, m_dynamic_reg_infos.size()); + m_dynamic_reg_sets.push_back(g_reg_set_gcs_arm64); + m_dynamic_reg_sets.back().registers = m_gcs_regnum_collection.data(); +} + uint32_t RegisterInfoPOSIX_arm64::ConfigureVectorLengthSVE(uint32_t sve_vq) { // sve_vq contains SVE Quad vector length in context of AArch64 SVE. // SVE register infos if enabled cannot be disabled by selecting sve_vq = 0. 
@@ -561,6 +590,10 @@ bool RegisterInfoPOSIX_arm64::IsFPMRReg(unsigned reg) const { return llvm::is_contained(m_fpmr_regnum_collection, reg); } +bool RegisterInfoPOSIX_arm64::IsGCSReg(unsigned reg) const { + return llvm::is_contained(m_gcs_regnum_collection, reg); +} + uint32_t RegisterInfoPOSIX_arm64::GetRegNumSVEZ0() const { return sve_z0; } uint32_t RegisterInfoPOSIX_arm64::GetRegNumSVEFFR() const { return sve_ffr; } @@ -593,4 +626,8 @@ uint32_t RegisterInfoPOSIX_arm64::GetSMEOffset() const { uint32_t RegisterInfoPOSIX_arm64::GetFPMROffset() const { return m_register_info_p[m_fpmr_regnum_collection[0]].byte_offset; -} \ No newline at end of file +} + +uint32_t RegisterInfoPOSIX_arm64::GetGCSOffset() const { + return m_register_info_p[m_gcs_regnum_collection[0]].byte_offset; +} diff --git a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h index 16a951ef0935f..d2ddf7d86d8c3 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h +++ b/lldb/source/Plugins/Process/Utility/RegisterInfoPOSIX_arm64.h @@ -33,6 +33,7 @@ class RegisterInfoPOSIX_arm64 eRegsetMaskZA = 32, eRegsetMaskZT = 64, eRegsetMaskFPMR = 128, + eRegsetMaskGCS = 256, eRegsetMaskDynamic = ~1, }; @@ -113,6 +114,8 @@ class RegisterInfoPOSIX_arm64 void AddRegSetFPMR(); + void AddRegSetGCS(); + uint32_t ConfigureVectorLengthSVE(uint32_t sve_vq); void ConfigureVectorLengthZA(uint32_t za_vq); @@ -132,6 +135,7 @@ class RegisterInfoPOSIX_arm64 bool IsMTEPresent() const { return m_opt_regsets.AnySet(eRegsetMaskMTE); } bool IsTLSPresent() const { return m_opt_regsets.AnySet(eRegsetMaskTLS); } bool IsFPMRPresent() const { return m_opt_regsets.AnySet(eRegsetMaskFPMR); } + bool IsGCSPresent() const { return m_opt_regsets.AnySet(eRegsetMaskGCS); } bool IsSVEReg(unsigned reg) const; bool IsSVEZReg(unsigned reg) const; @@ -144,6 +148,7 @@ class RegisterInfoPOSIX_arm64 bool IsSMERegZA(unsigned reg) const; bool IsSMERegZT(unsigned reg) const; bool IsFPMRReg(unsigned reg) const; + bool IsGCSReg(unsigned reg) const; uint32_t GetRegNumSVEZ0() const; uint32_t GetRegNumSVEFFR() const; @@ -156,6 +161,7 @@ class RegisterInfoPOSIX_arm64 uint32_t GetTLSOffset() const; uint32_t GetSMEOffset() const; uint32_t GetFPMROffset() const; + uint32_t GetGCSOffset() const; private: typedef std::map> @@ -188,6 +194,7 @@ class RegisterInfoPOSIX_arm64 std::vector m_tls_regnum_collection; std::vector m_sme_regnum_collection; std::vector m_fpmr_regnum_collection; + std::vector m_gcs_regnum_collection; }; #endif diff --git a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py index 0928ff8e14e00..d3d4dbecf4a2a 100644 --- a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py +++ b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py @@ -83,3 +83,155 @@ def test_gcs_fault(self): "stop reason = signal SIGSEGV: control protection fault", ], ) + + @skipUnlessArch("aarch64") + @skipUnlessPlatform(["linux"]) + def test_gcs_registers(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + self.runCmd("b test_func") + self.runCmd("b test_func2") + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + 
self.expect("register read --all", substrs=["Guarded Control Stack Registers:"]) + + # This helper reads all the GCS registers and optionally compares them + # against a previous state, then returns the current register values. + def check_gcs_registers( + expected_gcs_features_enabled=None, + expected_gcs_features_locked=None, + expected_gcspr_el0=None, + ): + thread = self.dbg.GetSelectedTarget().process.GetThreadAtIndex(0) + registerSets = thread.GetFrameAtIndex(0).GetRegisters() + gcs_registers = registerSets.GetFirstValueByName( + r"Guarded Control Stack Registers" + ) + + gcs_features_enabled = gcs_registers.GetChildMemberWithName( + "gcs_features_enabled" + ).GetValueAsUnsigned() + if expected_gcs_features_enabled is not None: + self.assertEqual(expected_gcs_features_enabled, gcs_features_enabled) + + gcs_features_locked = gcs_registers.GetChildMemberWithName( + "gcs_features_locked" + ).GetValueAsUnsigned() + if expected_gcs_features_locked is not None: + self.assertEqual(expected_gcs_features_locked, gcs_features_locked) + + gcspr_el0 = gcs_registers.GetChildMemberWithName( + "gcspr_el0" + ).GetValueAsUnsigned() + if expected_gcspr_el0 is not None: + self.assertEqual(expected_gcspr_el0, gcspr_el0) + + return gcs_features_enabled, gcs_features_locked, gcspr_el0 + + enabled, locked, spr_el0 = check_gcs_registers() + + # Features enabled should have at least the enable bit set, it could have + # others depending on what the C library did, but we can't rely on always + # having them. + self.assertTrue(enabled & 1, "Expected GCS enable bit to be set.") + + # Features locked we cannot predict, we will just assert that it remains + # the same as we continue. + + # spr_el0 will point to some memory region that is a shadow stack region. + self.expect(f"memory region {spr_el0}", substrs=["shadow stack: yes"]) + + # Continue into test_func2, where the GCS pointer should have been + # decremented, and the other registers remain the same. + self.runCmd("continue") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + _, _, spr_el0 = check_gcs_registers(enabled, locked, spr_el0 - 8) + + # Any combination of GCS feature lock bits might have been set by the C + # library, and could be set to 0 or 1. To check that we can modify them, + # invert one of those bits then write it back to the lock register. + # The stack pushing feature is bit 2 of that register. + STACK_PUSH = 2 + # Get the original value of the stack push lock bit. + stack_push = bool((locked >> STACK_PUSH) & 1) + # Invert the value and put it back into the set of lock bits. + new_locked = (locked & ~(1 << STACK_PUSH)) | (int(not stack_push) << STACK_PUSH) + # Write the new lock bits, which are the same as before, only with stack + # push locked (if it was previously unlocked), or unlocked (if it was + # previously locked). + self.runCmd(f"register write gcs_features_locked 0x{new_locked:x}") + # We should be able to read back this new set of lock bits. + self.expect( + f"register read gcs_features_locked", + substrs=[f"gcs_features_locked = 0x{new_locked:016x}"], + ) + + # We could prove the write made it to hardware by trying to prctl() to + # enable or disable the stack push feature here, but because the libc + # may or may not have locked it, it's tricky to coordinate this. Given + # that we know the other registers can be written and their values are + # seen by the process, we can assume this is too. 
+ + # Restore the original lock bits, as the libc may rely on being able + # to use certain features during program execution. + self.runCmd(f"register write gcs_features_locked 0x{locked:x}") + + # Modify the guarded control stack pointer to cause a fault. + spr_el0 += 8 + self.runCmd(f"register write gcspr_el0 {spr_el0}") + self.expect( + "register read gcspr_el0", substrs=[f"gcspr_el0 = 0x{spr_el0:016x}"] + ) + + # If we wrote it back correctly, we will now fault. Don't pass this signal + # to the application, as we will continue past it later. + self.runCmd("process handle SIGSEGV --pass false") + self.runCmd("continue") + + self.expect( + "thread list", + "Expected stopped by SIGSEGV.", + substrs=[ + "stopped", + "stop reason = signal SIGSEGV: control protection fault", + ], + ) + + # Now to prove we can write gcs_features_enabled, disable GCS and continue + # past the fault we caused. Note that although the libc likely locked the + # ability to disable GCS, ptrace bypasses the lock bits. + enabled &= ~1 + self.runCmd(f"register write gcs_features_enabled {enabled}") + self.expect( + "register read gcs_features_enabled", + substrs=[f"gcs_features_enabled = 0x{enabled:016x}"], + ) + + # With GCS disabled, the invalid guarded control stack pointer is not + # checked, so the program can finish normally. + self.runCmd("continue") + self.expect( + "process status", + substrs=[ + "exited with status = 0", + ], + ) diff --git a/lldb/test/API/linux/aarch64/gcs/main.c b/lldb/test/API/linux/aarch64/gcs/main.c index 32a9b07c20743..09354639af376 100644 --- a/lldb/test/API/linux/aarch64/gcs/main.c +++ b/lldb/test/API/linux/aarch64/gcs/main.c @@ -2,8 +2,8 @@ #include #include -#ifndef HWCAP2_GCS -#define HWCAP2_GCS (1UL << 63) +#ifndef HWCAP_GCS +#define HWCAP_GCS (1UL << 32) #endif #define PR_GET_SHADOW_STACK_STATUS 74 @@ -49,8 +49,14 @@ void gcs_signal() { "ret\n"); } +// These functions are used to observe gcspr_el0 changing as we enter them, and +// the fault we cause by changing its value. +void test_func2() { volatile int i = 99; } + +void test_func() { test_func2(); } + int main() { - if (!(getauxval(AT_HWCAP2) & HWCAP2_GCS)) + if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) return 1; unsigned long mode = get_gcs_status(); @@ -63,7 +69,16 @@ int main() { } // By now we should have one memory region where the GCS is stored. - gcs_signal(); // Set break point at this line. + + // For register read/write tests. + test_func(); + + // If this was a register test, we would have disabled GCS during the + // test_func call. We cannot re-enable it from ptrace so skip this part in + // this case. + mode = get_gcs_status(); + if ((mode & 1) == 1) + gcs_signal(); // Set break point at this line. return 0; } From 11b040192640ef3b1f481124c440f464ed6ec86a Mon Sep 17 00:00:00 2001 From: Aaditya <115080342+easyonaadit@users.noreply.github.com> Date: Fri, 24 Jan 2025 19:13:40 +0530 Subject: [PATCH 004/432] [AMDGPU] Restore SP from saved-FP or saved-BP (#124007) Currently, the AMDGPU backend bumps the Stack Pointer by fixed size offsets in the prolog of device functions, and restores it by the same amount in the epilog. Prolog: sp += frameSize Epilog: sp -= frameSize If a function has dynamic stack realignment, Prolog: sp += frameSize + max_alignment Epilog: sp -= frameSize + max_alignment These calculations are not optimal in case of dynamic stack realignment, and completely fail in case of dynamic stack readjustment. This patch uses the saved Frame Pointer to restore SP. 
Prolog:
    fp = sp
    sp += frameSize
Epilog:
    sp = fp

In case of dynamic stack realignment, SP is restored from the saved Base
Pointer.

Prolog:
    fp = sp + (max_alignment - 1)
    fp = fp & (-max_alignment)
    bp = sp
    sp += frameSize + max_alignment
Epilog:
    sp = bp

(Note: The presence of BP has been enforced in case of any dynamic stack
realignment.)

---------

Co-authored-by: Pravin Jagtap
Co-authored-by: Matt Arsenault
---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 20 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 3 +-
 .../CodeGen/AMDGPU/GlobalISel/assert-align.ll | 2 +-
 .../GlobalISel/call-outgoing-stack-args.ll | 8 +-
 .../GlobalISel/dynamic-alloca-uniform.ll | 48 +-
 .../CodeGen/AMDGPU/GlobalISel/localizer.ll | 2 +-
 .../AMDGPU/GlobalISel/non-entry-alloca.ll | 7 +-
 .../abi-attribute-hints-undefined-behavior.ll | 2 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll | 10 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll | 72 +-
 ...ffer-fat-pointers-contents-legalization.ll | 10 +-
 llvm/test/CodeGen/AMDGPU/call-args-inreg.ll | 96 +-
 .../CodeGen/AMDGPU/call-argument-types.ll | 40 +-
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll | 59 +-
 .../callee-special-input-vgprs-packed.ll | 2 +-
 .../AMDGPU/callee-special-input-vgprs.ll | 2 +-
 .../AMDGPU/cross-block-use-is-not-abi-copy.ll | 8 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll | 2 +-
 .../test/CodeGen/AMDGPU/dynamic_stackalloc.ll | 184 ++--
 .../eliminate-frame-index-v-add-co-u32.mir | 240 +++--
 .../eliminate-frame-index-v-add-u32.mir | 25 +-
 .../fix-frame-reg-in-custom-csr-spills.ll | 7 +-
 ...frame-setup-without-sgpr-to-vgpr-spills.ll | 4 +-
 .../CodeGen/AMDGPU/function-args-inreg.ll | 8 +-
 .../CodeGen/AMDGPU/gfx-call-non-gfx-func.ll | 4 +-
 .../AMDGPU/gfx-callable-argument-types.ll | 959 +++++++++---------
 .../gfx-callable-preserved-registers.ll | 60 +-
 .../AMDGPU/gfx-callable-return-types.ll | 150 +--
 llvm/test/CodeGen/AMDGPU/global-alias.ll | 2 +-
 llvm/test/CodeGen/AMDGPU/indirect-call.ll | 32 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll | 2 +-
 .../CodeGen/AMDGPU/insert-waitcnts-crash.ll | 2 +-
 .../local-stack-alloc-block-sp-reference.ll | 10 +-
 .../materialize-frame-index-sgpr.gfx10.ll | 49 +-
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll | 2 +-
 .../AMDGPU/need-fp-from-vgpr-spills.ll | 8 +-
 llvm/test/CodeGen/AMDGPU/nested-calls.ll | 3 +-
 .../AMDGPU/no-source-locations-in-prologue.ll | 3 +-
 llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll | 14 +-
 .../AMDGPU/pei-scavenge-sgpr-carry-out.mir | 54 +-
 .../CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir | 10 +-
 .../test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir | 5 +-
 .../AMDGPU/pei-scavenge-vgpr-spill.mir | 15 +-
 .../AMDGPU/preserve-wwm-copy-dst-reg.ll | 6 +-
 .../AMDGPU/schedule-amdgpu-trackers.ll | 4 +-
 .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 2 +-
 .../AMDGPU/sgpr-spills-split-regalloc.ll | 6 +-
 llvm/test/CodeGen/AMDGPU/sibling-call.ll | 2 +-
 llvm/test/CodeGen/AMDGPU/stack-realign.ll | 16 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll | 14 +-
 .../AMDGPU/strictfp_f16_abi_promote.ll | 18 +-
 .../AMDGPU/tail-call-inreg-arguments.error.ll | 4 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll | 4 +-
 .../CodeGen/AMDGPU/use_restore_frame_reg.mir | 10 +-
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll | 12 +-
 llvm/test/CodeGen/AMDGPU/wave32.ll | 4 +-
 .../AMDGPU/whole-wave-register-copy.ll | 2 +-
 .../AMDGPU/whole-wave-register-spill.ll | 4 +-
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll | 8 +-
 ...dgpu_generated_funcs.ll.generated.expected | 4 +-
 ...pu_generated_funcs.ll.nogenerated.expected | 4 +-
 61 files changed, 1305 insertions(+), 1064 deletions(-)
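Before the diff itself, a small worked example of the failure mode (plain
scalar arithmetic with invented numbers, not AMDGPU code; the names mirror
the pseudo-code above):

  uint64_t sp = 0x1010;                   // SP at function entry
  uint64_t frameSize = 0x80, maxAlign = 0x100;

  // Prolog with dynamic stack realignment:
  uint64_t fp = (sp + maxAlign - 1) & ~(maxAlign - 1); // fp = 0x1100
  uint64_t bp = sp;                       // bp = 0x1010, entry SP saved
  sp += frameSize + maxAlign;             // sp = 0x1190

  // A dynamic alloca in the body grows the stack by an amount known only
  // at run time, say 0x40:
  sp += 0x40;                             // sp = 0x11d0

  // Old epilog: sp -= frameSize + maxAlign yields 0x1050, not 0x1010.
  // New epilog: restore the saved entry value directly.
  sp = bp;                                // sp = 0x1010, always correct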
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 2e2523312840a..060db477a59f8 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1256,6 +1256,18 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, Register FramePtrReg = FuncInfo->getFrameOffsetReg(); bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg); + if (RoundedSize != 0) { + if (TRI.hasBasePointer(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg) + .addReg(TRI.getBaseRegister()) + .setMIFlag(MachineInstr::FrameDestroy); + } else if (hasFP(MF)) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), StackPtrReg) + .addReg(FramePtrReg) + .setMIFlag(MachineInstr::FrameDestroy); + } + } + Register FramePtrRegScratchCopy; Register SGPRForFPSaveRestoreCopy = FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg); @@ -1280,14 +1292,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF, FramePtrRegScratchCopy); } - if (RoundedSize != 0 && hasFP(MF)) { - auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg) - .addReg(StackPtrReg) - .addImm(-static_cast(RoundedSize * getScratchScaleFactor(ST))) - .setMIFlag(MachineInstr::FrameDestroy); - Add->getOperand(3).setIsDead(); // Mark SCC as dead. - } - if (FPSaved) { // Insert the copy to restore FP. Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 11121e6058770..6fc57dec6a826 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -525,8 +525,7 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const { // When we need stack realignment, we can't reference off of the // stack pointer, so we reserve a base pointer. 
- const MachineFrameInfo &MFI = MF.getFrameInfo(); - return MFI.getNumFixedObjects() && shouldRealignStack(MF); + return shouldRealignStack(MF); } Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll index 604caf572b0fe..c477732e5cd59 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -27,11 +27,11 @@ define ptr addrspace(1) @call_assert_align() { ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll index 974ce492daea8..410d3b1bb7062 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -247,11 +247,11 @@ define void @func_caller_stack() { ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -286,11 +286,11 @@ define void @func_caller_stack() { ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -372,11 +372,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: s_setpc_b64 s[30:31] @@ -437,11 +437,11 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) { ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] -; FLATSCR-NEXT: s_add_i32 s32, 
s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll index ae055ea041297..6b767d9e754be 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll @@ -80,13 +80,13 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -103,7 +103,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -112,7 +111,8 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align4: @@ -127,7 +127,6 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 -; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -136,9 +135,10 @@ define void @func_dynamic_stackalloc_sgpr_align4() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 4 %alloca = alloca i32, i32 %n, addrspace(5) @@ -221,13 +221,13 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0 -; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshl2_add_u32 s4, s4, 15 ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s6, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_mov_b32 s32, s33 +; GFX9-NEXT: s_mov_b32 s33, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -244,7 +244,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s6 -; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: 
buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_load_dword s4, s[4:5], 0x0 @@ -253,7 +252,8 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s6, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_mov_b32 s32, s33 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align16: @@ -268,7 +268,6 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_mov_b32 s2, s32 -; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: scratch_store_b32 off, v0, s2 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x0 @@ -277,9 +276,10 @@ define void @func_dynamic_stackalloc_sgpr_align16() { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s0, s0, -16 ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s32, s2, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_mov_b32 s32, s33 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv, align 16 %alloca = alloca i32, i32 %n, addrspace(5) @@ -355,6 +355,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_mov_b32 s6, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7c0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffff800 +; GFX9-NEXT: s_mov_b32 s7, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_addk_i32 s32, 0x1000 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 @@ -373,7 +375,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX9-NEXT: s_and_b32 s4, s4, -16 ; GFX9-NEXT: s_lshl_b32 s4, s4, 6 ; GFX9-NEXT: s_add_u32 s32, s5, s4 -; GFX9-NEXT: s_addk_i32 s32, 0xf000 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -382,8 +385,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_mov_b32 s6, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3e0 -; GFX10-NEXT: s_addk_i32 s32, 0x800 +; GFX10-NEXT: s_mov_b32 s7, s34 ; GFX10-NEXT: s_and_b32 s33, s33, 0xfffffc00 +; GFX10-NEXT: s_mov_b32 s34, s32 +; GFX10-NEXT: s_addk_i32 s32, 0x800 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 @@ -401,7 +406,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX10-NEXT: s_and_b32 s4, s4, -16 ; GFX10-NEXT: s_lshl_b32 s4, s4, 5 ; GFX10-NEXT: s_add_u32 s32, s5, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xf800 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: func_dynamic_stackalloc_sgpr_align32: @@ -409,8 +415,10 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_mov_b32 s2, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 31 -; GFX11-NEXT: s_add_i32 s32, s32, 64 +; GFX11-NEXT: s_mov_b32 s3, s34 ; GFX11-NEXT: s_and_not1_b32 s33, s33, 31 +; GFX11-NEXT: s_mov_b32 s34, s32 +; 
GFX11-NEXT: s_add_i32 s32, s32, 64 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, gv@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, gv@gotpcrel32@hi+12 @@ -429,8 +437,8 @@ define void @func_dynamic_stackalloc_sgpr_align32(ptr addrspace(1) %out) { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s0, s0, 5 ; GFX11-NEXT: s_add_u32 s32, s1, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_addk_i32 s32, 0xffc0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %n = load i32, ptr addrspace(4) @gv %alloca = alloca i32, i32 %n, align 32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll index 009beeb395100..767232a01c7e5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -248,11 +248,11 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) { ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll index 69abef02d3d92..34cf6905fe75b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -180,7 +180,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -216,8 +216,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_mov_b32 s7, s33 ; GCN-NEXT: s_add_i32 s33, s32, 0xfc0 +; GCN-NEXT: s_mov_b32 s8, s34 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GCN-NEXT: s_and_b32 s33, s33, 0xfffff000 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x2000 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 @@ -242,7 +244,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_addk_i32 s32, 0xe000 +; GCN-NEXT: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, s8 ; GCN-NEXT: s_mov_b32 s33, s7 ; GCN-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll index e53653408feb4..194a23fa0d4a9 100644 --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -32,11 +32,11 @@ define void @parent_func_missing_inputs() #0 { ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 
s30, v40, 0 +; FIXEDABI-NEXT: s_mov_b32 s32, s33 ; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7] -; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 ; FIXEDABI-NEXT: s_mov_b32 s33, s4 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index 25b6b7be1f3b5..ab2363860af9d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -193,11 +193,11 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -329,11 +329,11 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -477,11 +477,11 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -614,11 +614,11 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: v_readlane_b32 s34, v42, 2 ; CHECK-NEXT: v_readlane_b32 s31, v42, 1 ; CHECK-NEXT: v_readlane_b32 s30, v42, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v42, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -761,11 +761,11 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: v_readlane_b32 s34, v43, 2 ; CHECK-NEXT: v_readlane_b32 s31, v43, 1 ; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: 
s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index ec469b3020cce..0382cc72a36ae 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3798,10 +3798,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v2, 1 ; GCN-NEXT: v_readlane_b32 s30, v2, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -3829,10 +3829,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v2, 1 ; GFX7-NEXT: v_readlane_b32 s30, v2, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -3858,10 +3858,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -3887,10 +3887,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -3917,11 +3917,11 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -3947,10 +3947,10 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -3990,10 +3990,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr 
addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v4, 1 ; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4026,10 +4026,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4055,10 +4055,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v2, 1 ; GFX8-NEXT: v_readlane_b32 s30, v2, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4084,10 +4084,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4114,11 +4114,11 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4144,10 +4144,10 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4189,10 +4189,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v5, 1 ; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: 
s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4227,10 +4227,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v4, 1 ; GFX7-NEXT: v_readlane_b32 s30, v4, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4259,10 +4259,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4290,10 +4290,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4322,11 +4322,11 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4354,10 +4354,10 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4407,10 +4407,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v8, 1 ; GCN-NEXT: v_readlane_b32 s30, v8, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4453,10 +4453,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v6, 1 ; GFX7-NEXT: v_readlane_b32 
s30, v6, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -4485,10 +4485,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v4, 1 ; GFX8-NEXT: v_readlane_b32 s30, v4, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4516,10 +4516,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4548,11 +4548,11 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v3, 1 ; GFX10-NEXT: v_readlane_b32 s30, v3, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4578,10 +4578,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v3, 1 ; GFX11-NEXT: v_readlane_b32 s30, v3, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v3, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4651,10 +4651,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v16, 1 ; GCN-NEXT: v_readlane_b32 s30, v16, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -4717,10 +4717,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v10, 1 ; GFX7-NEXT: v_readlane_b32 s30, v10, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: 
s_setpc_b64 s[30:31] @@ -4755,10 +4755,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v6, 1 ; GFX8-NEXT: v_readlane_b32 s30, v6, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -4790,10 +4790,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v5, 1 ; GFX9-NEXT: v_readlane_b32 s30, v5, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4826,11 +4826,11 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v5, 1 ; GFX10-NEXT: v_readlane_b32 s30, v5, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4856,10 +4856,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4969,10 +4969,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v20, 1 ; GCN-NEXT: v_readlane_b32 s30, v20, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -5075,10 +5075,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readlane_b32 s31, v18, 1 ; GFX7-NEXT: v_readlane_b32 s30, v18, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s18 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -5125,10 +5125,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readlane_b32 s31, v10, 1 ; GFX8-NEXT: v_readlane_b32 s30, v10, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; 
GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -5168,10 +5168,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readlane_b32 s31, v9, 1 ; GFX9-NEXT: v_readlane_b32 s30, v9, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s18 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5212,11 +5212,11 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_readlane_b32 s31, v9, 1 ; GFX10-NEXT: v_readlane_b32 s30, v9, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s18 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5244,10 +5244,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_readlane_b32 s31, v9, 1 ; GFX11-NEXT: v_readlane_b32 s30, v9, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v9, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index cdfaed0a203e9..9912ce3604a49 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -3099,8 +3099,11 @@ define i160 @load_i160(ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_mov_b32 s4, s33 ; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 ; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800 +; SDAG-NEXT: s_mov_b32 s5, s34 +; SDAG-NEXT: s_mov_b32 s34, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x1800 -; SDAG-NEXT: s_addk_i32 s32, 0xe800 +; SDAG-NEXT: s_mov_b32 s32, s34 +; SDAG-NEXT: s_mov_b32 s34, s5 ; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -3124,10 +3127,13 @@ define void @store_i160(i160 %data, ptr addrspace(8) inreg %buf) { ; SDAG-NEXT: s_mov_b32 s4, s33 ; SDAG-NEXT: s_add_i32 s33, s32, 0x7c0 ; SDAG-NEXT: s_and_b32 s33, s33, 0xfffff800 +; SDAG-NEXT: s_mov_b32 s5, s34 +; SDAG-NEXT: s_mov_b32 s34, s32 ; SDAG-NEXT: s_addk_i32 s32, 0x1000 ; SDAG-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; SDAG-NEXT: buffer_store_dword v4, off, s[16:19], 0 offset:16 -; SDAG-NEXT: s_addk_i32 s32, 0xf000 +; SDAG-NEXT: s_mov_b32 s32, s34 +; SDAG-NEXT: s_mov_b32 s34, s5 ; SDAG-NEXT: s_mov_b32 s33, s4 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll index 93a4469c7718e..704b68aa296a9 100644 --- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll 
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll @@ -51,11 +51,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -79,11 +79,11 @@ define void @test_call_external_void_func_i8_inreg(i8 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -111,11 +111,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -139,11 +139,11 @@ define void @test_call_external_void_func_i16_inreg(i16 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -171,11 +171,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -199,11 +199,11 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -232,11 +232,11 @@ define void 
@test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -260,11 +260,11 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -293,11 +293,11 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -321,11 +321,11 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -355,11 +355,11 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -383,11 +383,11 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -418,11 +418,11 @@ define void @test_call_external_void_func_v4i32_inreg(<4 
x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -446,11 +446,11 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -485,11 +485,11 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -513,11 +513,11 @@ define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -545,11 +545,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -573,11 +573,11 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -605,11 +605,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX9-NEXT: 
s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -633,11 +633,11 @@ define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -665,11 +665,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -693,11 +693,11 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -726,11 +726,11 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -754,11 +754,11 @@ define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -786,11 +786,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: 
v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -814,11 +814,11 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -847,11 +847,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -875,11 +875,11 @@ define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -908,11 +908,11 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -936,11 +936,11 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -969,11 +969,11 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; 
GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -997,11 +997,11 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1030,11 +1030,11 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1058,11 +1058,11 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1091,11 +1091,11 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1119,11 +1119,11 @@ define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1151,11 +1151,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1179,11 +1179,11 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1214,11 +1214,11 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1242,11 +1242,11 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1275,11 +1275,11 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1303,11 +1303,11 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1339,11 +1339,11 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1367,11 +1367,11 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1411,11 +1411,11 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX9-NEXT: s_swappc_b64 s[30:31], vcc
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1439,11 +1439,11 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1486,11 +1486,11 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1514,11 +1514,11 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 26ab0f3ce6355..35d00390067d8 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -5873,11 +5873,11 @@ define void @stack_12xv3i32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -5941,11 +5941,11 @@ define void @stack_12xv3i32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -6009,11 +6009,11 @@ define void @stack_12xv3i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6059,11 +6059,11 @@ define void @stack_12xv3i32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6127,11 +6127,11 @@ define void @stack_12xv3i32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
@@ -6212,11 +6212,11 @@ define void @stack_12xv3f32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -6280,11 +6280,11 @@ define void @stack_12xv3f32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -6348,11 +6348,11 @@ define void @stack_12xv3f32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6402,11 +6402,11 @@ define void @stack_12xv3f32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6470,11 +6470,11 @@ define void @stack_12xv3f32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
@@ -6563,11 +6563,11 @@ define void @stack_8xv5i32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -6639,11 +6639,11 @@ define void @stack_8xv5i32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -6715,11 +6715,11 @@ define void @stack_8xv5i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -6770,11 +6770,11 @@ define void @stack_8xv5i32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6846,11 +6846,11 @@ define void @stack_8xv5i32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
@@ -6935,11 +6935,11 @@ define void @stack_8xv5f32() #0 {
 ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; VI-NEXT: v_readlane_b32 s31, v40, 1
 ; VI-NEXT: v_readlane_b32 s30, v40, 0
+; VI-NEXT: s_mov_b32 s32, s33
 ; VI-NEXT: v_readlane_b32 s4, v40, 2
 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; VI-NEXT: s_mov_b64 exec, s[6:7]
-; VI-NEXT: s_addk_i32 s32, 0xfc00
 ; VI-NEXT: s_mov_b32 s33, s4
 ; VI-NEXT: s_waitcnt vmcnt(0)
 ; VI-NEXT: s_setpc_b64 s[30:31]
@@ -7011,11 +7011,11 @@ define void @stack_8xv5f32() #0 {
 ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; CI-NEXT: v_readlane_b32 s31, v40, 1
 ; CI-NEXT: v_readlane_b32 s30, v40, 0
+; CI-NEXT: s_mov_b32 s32, s33
 ; CI-NEXT: v_readlane_b32 s4, v40, 2
 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CI-NEXT: s_mov_b64 exec, s[6:7]
-; CI-NEXT: s_addk_i32 s32, 0xfc00
 ; CI-NEXT: s_mov_b32 s33, s4
 ; CI-NEXT: s_waitcnt vmcnt(0)
 ; CI-NEXT: s_setpc_b64 s[30:31]
@@ -7087,11 +7087,11 @@ define void @stack_8xv5f32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s4
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7145,11 +7145,11 @@ define void @stack_8xv5f32() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7221,11 +7221,11 @@ define void @stack_8xv5f32() #0 {
 ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5]
 ; HSA-NEXT: v_readlane_b32 s31, v40, 1
 ; HSA-NEXT: v_readlane_b32 s30, v40, 0
+; HSA-NEXT: s_mov_b32 s32, s33
 ; HSA-NEXT: v_readlane_b32 s4, v40, 2
 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; HSA-NEXT: s_mov_b64 exec, s[6:7]
-; HSA-NEXT: s_addk_i32 s32, 0xfc00
 ; HSA-NEXT: s_mov_b32 s33, s4
 ; HSA-NEXT: s_waitcnt vmcnt(0)
 ; HSA-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
index e926a3c728cbd..6fb071dd42d2f 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -58,8 +58,8 @@ define void @callee_with_stack() #0 {
 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33{{$}}
 ; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}}
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_addk_i32 s32, 0xfe00
-; FLATSCR-NEXT: s_add_i32 s32, s32, -8
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_setpc_b64
 define void @callee_with_stack_no_fp_elim_all() #1 {
@@ -106,13 +106,13 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]]
 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]]
+; MUBUF: s_mov_b32 s32, s33{{$}}
+; FLATSCR: s_mov_b32 s32, s33{{$}}
 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfc00{{$}}
-; FLATSCR: s_add_i32 s32, s32, -16{{$}}
 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -149,13 +149,13 @@ define void @callee_with_stack_and_call() #0 {
 ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0
 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1
+; MUBUF: s_mov_b32 s32, s33
+; FLATSCR: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]]
 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfc00
-; FLATSCR: s_add_i32 s32, s32, -16
 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -253,10 +253,10 @@ define void @spill_only_csr_sgpr() {
 ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
 ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
 ; MUBUF: s_addk_i32 s32, 0x300
-; MUBUF-NEXT: s_addk_i32 s32, 0xfd00
+; MUBUF-NEXT: s_mov_b32 s32, s33
 ; MUBUF-NEXT: s_mov_b32 s33, s4
 ; FLATSCR: s_add_i32 s32, s32, 12
-; FLATSCR-NEXT: s_add_i32 s32, s32, -12
+; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; FLATSCR-NEXT: s_mov_b32 s33, s0
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -282,16 +282,14 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4
 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4
 ; GCN: ;;#ASMSTART
-; GCN: v_writelane_b32 v1
-; MUBUF: s_addk_i32 s32, 0x400
-; FLATSCR: s_add_i32 s32, s32, 16
+; MUBUF: s_mov_b32 s32, s33
+; FLATSCR: s_mov_b32 s32, s33
+
 ; GCN: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfc00
-; FLATSCR: s_add_i32 s32, s32, -16
 ; GCN-NEXT: s_mov_b32 s33, [[TMP_SGPR]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -333,12 +331,12 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
 ; MUBUF: s_addk_i32 s32, 0x400
 ; FLATSCR: s_add_i32 s32, s32, 16
 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF-NEXT: s_addk_i32 s32, 0xfc00
-; FLATSCR-NEXT: s_add_i32 s32, s32, -16
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -366,16 +364,22 @@ define void @no_new_vgpr_for_fp_csr() #1 {
 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000
 ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
+; MUBUF-NEXT: s_mov_b32 s5, s34
+; FLATSCR-NEXT: s_mov_b32 s1, s34
+; MUBUF-NEXT: s_mov_b32 s34, s32
+; FLATSCR-NEXT: s_mov_b32 s34, s32
 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x180000
 ; FLATSCR-NEXT: s_addk_i32 s32, 0x6000
 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; MUBUF-NEXT: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}}
 ; MUBUF-NEXT: buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}}
-; FLATSCR-NEXT: s_add_i32 s1, s33, 0x2000
-; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s1
+; FLATSCR-NEXT: s_add_i32 s2, s33, 0x2000
+; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s2
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; MUBUF-NEXT: s_add_i32 s32, s32, 0xffe80000
-; FLATSCR-NEXT: s_addk_i32 s32, 0xa000
+; MUBUF-NEXT: s_mov_b32 s32, s34
+; MUBUF-NEXT: s_mov_b32 s34, s5
+; FLATSCR-NEXT: s_mov_b32 s32, s34
+; FLATSCR-NEXT: s_mov_b32 s34, s1
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_setpc_b64
 define void @realign_stack_no_fp_elim() #1 {
@@ -399,16 +403,13 @@ define void @realign_stack_no_fp_elim() #1 {
 ; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}}
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN: ;;#ASMSTART
-; MUBUF: s_addk_i32 s32, 0x300
-; FLATSCR: s_add_i32 s32, s32, 12
 ; GCN: v_readlane_b32 s31, [[CSR_VGPR]], 1
 ; GCN: v_readlane_b32 s30, [[CSR_VGPR]], 0
+;GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfd00
-; FLATSCR: s_add_i32 s32, s32, -12
 ; GCN-NEXT: s_mov_b32 s33, vcc_lo
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -435,19 +436,19 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
 ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
+; GCN: v_mov_b32_e32
-; MUBUF-DAG: buffer_store_dword
-; FLATSCR-DAG: scratch_store_dword
 ; MUBUF: s_addk_i32 s32, 0x300{{$}}
 ; FLATSCR: s_add_i32 s32, s32, 12{{$}}
+; MUBUF-DAG: buffer_store_dword
+; FLATSCR-DAG: scratch_store_dword
 ; GCN: ;;#ASMSTART
+; GCN: s_mov_b32 s32, s33
 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
 ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_addk_i32 s32, 0xfd00{{$}}
-; FLATSCR: s_add_i32 s32, s32, -12{{$}}
 ; GCN-NEXT: s_mov_b32 s33, vcc_lo
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -495,8 +496,6 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
 ; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s33, 0x1004
 ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
-; MUBUF: s_add_i32 s32, s32, 0xfffbfd00{{$}}
-; FLATSCR: s_addk_i32 s32, 0xeff4{{$}}
 ; GCN-NEXT: s_mov_b32 s33, vcc_lo
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -538,8 +537,6 @@ define internal void @local_empty_func() #0 {
 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}}
 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}}
 ; GCN: s_swappc_b64
-; MUBUF: s_addk_i32 s32, 0xfc00
-; FLATSCR: s_add_i32 s32, s32, -16
 ; GCN: s_mov_b32 s33, [[TMP_SGPR]]
 define void @ipra_call_with_stack() #0 {
 %alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
index b52e7918b27ab..f85cea1ba03fb 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll
@@ -428,8 +428,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN: s_swappc_b64
+; GCN: s_mov_b32 s32, s33
 ; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN: s_addk_i32 s32, 0xfc00{{$}}
 ; GCN: s_setpc_b64
 define void @too_many_args_call_too_many_args_use_workitem_id_x(
 i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
index 9792c9dabac2f..2399112e3fefb 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll
@@ -441,8 +441,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
 ; GCN: s_swappc_b64
+; GCN: s_mov_b32 s32, s33
 ; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
-; GCN: s_addk_i32 s32, 0xfc00{{$}}
 ; GCN: s_setpc_b64
 define void @too_many_args_call_too_many_args_use_workitem_id_x(
 i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
index 5e6152661aeec..9bef0b7d76ad5 100644
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -42,11 +42,11 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -78,11 +78,11 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -114,11 +114,11 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -151,11 +151,11 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
 ; GCN-NEXT: v_readlane_b32 s31, v40, 1
 ; GCN-NEXT: v_readlane_b32 s30, v40, 0
 ; GCN-NEXT: v_mov_b32_e32 v1, v4
+; GCN-NEXT: s_mov_b32 s32, s33
 ; GCN-NEXT: v_readlane_b32 s4, v40, 2
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_addk_i32 s32, 0xfc00
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
index 2cd3916165fe7..8b02bdbb70b7b 100644
--- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
+++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll
@@ -91,11 +91,11 @@ define weak_odr void @test(i32 %0) !dbg !34 {
 ; CHECK-NEXT: v_readlane_b32 s34, v41, 2
 ; CHECK-NEXT: v_readlane_b32 s31, v41, 1
 ; CHECK-NEXT: v_readlane_b32 s30, v41, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
 ; CHECK-NEXT: v_readlane_b32 s4, v41, 16
 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT: s_mov_b64 exec, s[6:7]
-; CHECK-NEXT: s_addk_i32 s32, 0xfc00
 ; CHECK-NEXT: s_mov_b32 s33, s4
 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 9acb3a42ae102..d61c4b46596c0 100644
--- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -1084,7 +1084,7 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1113,7 +1113,7 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1140,12 +1140,11 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform:
@@ -1171,11 +1170,12 @@ define void @test_dynamic_stackalloc_device_uniform(i32 %n) {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i32 %n, addrspace(5)
 store volatile i32 123, ptr addrspace(5) %alloca
@@ -1189,10 +1189,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-SDAG-NEXT: s_mov_b32 s9, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT: s_mov_b32 s10, s34
 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], exec
 ; GFX9-SDAG-NEXT: s_mov_b32 s6, 0
 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-SDAG-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1210,7 +1212,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 10
 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s10
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1220,10 +1223,12 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT: s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1241,7 +1246,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1251,17 +1257,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT: s_mov_b32 s5, s34
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
 ; GFX11-SDAG-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-SDAG-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX11-SDAG-NEXT: s_bitset0_b32 s1, s2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-SDAG-NEXT: s_max_u32 s0, s0, s3
 ; GFX11-SDAG-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX11-SDAG-NEXT: s_cbranch_scc1 .LBB9_1
@@ -1274,8 +1281,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s5
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_over_aligned:
@@ -1284,17 +1291,18 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
 ; GFX11-GISEL-NEXT: .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_readlane_b32 s3, v0, s2
 ; GFX11-GISEL-NEXT: s_bitset0_b32 s1, s2
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: s_max_u32 s0, s0, s3
 ; GFX11-GISEL-NEXT: s_cmp_lg_u32 s1, 0
 ; GFX11-GISEL-NEXT: s_cbranch_scc1 .LBB9_1
@@ -1307,7 +1315,8 @@ define void @test_dynamic_stackalloc_device_uniform_over_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i32 %n, align 128, addrspace(5)
 store volatile i32 10, ptr addrspace(5) %alloca
@@ -1340,7 +1349,7 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 22
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1369,7 +1378,7 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1396,12 +1405,11 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 22
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_uniform_under_aligned:
@@ -1427,11 +1435,12 @@ define void @test_dynamic_stackalloc_device_uniform_under_aligned(i32 %n) {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 22
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i32 %n, align 2, addrspace(5)
 store volatile i32 22, ptr addrspace(5) %alloca
@@ -1465,7 +1474,7 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1495,7 +1504,7 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1524,12 +1533,11 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x7b
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent:
@@ -1557,11 +1565,12 @@ define void @test_dynamic_stackalloc_device_divergent() {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %idx = call i32 @llvm.amdgcn.workitem.id.x()
 %alloca = alloca i32, i32 %idx, addrspace(5)
@@ -1575,6 +1584,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT: s_mov_b32 s10, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-SDAG-NEXT: s_mov_b32 s11, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
 ; GFX9-SDAG-NEXT: s_add_i32 s4, s32, 0x1fff
@@ -1598,7 +1609,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, 0x1bc
 ; GFX9-SDAG-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s11
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s10
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1609,10 +1621,12 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX9-GISEL-NEXT: s_mov_b32 s9, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GFX9-GISEL-NEXT: s_mov_b32 s10, s34
 ; GFX9-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], exec
 ; GFX9-GISEL-NEXT: s_mov_b32 s6, 0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x4000
 ; GFX9-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9-GISEL-NEXT: s_ff1_i32_b64 s7, s[4:5]
@@ -1630,7 +1644,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xc000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s10
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1640,14 +1655,16 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
+; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x100
 ; GFX11-SDAG-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15
 ; GFX11-SDAG-NEXT: s_add_i32 s0, s32, 0xfff
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 0xfffff000
 ; GFX11-SDAG-NEXT: s_and_b32 s33, s33, 0xffffff80
-; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x1ff0, v0
 ; GFX11-SDAG-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-SDAG-NEXT: s_ctz_i32_b32 s3, s2
 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -1665,7 +1682,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s0 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s6
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_over_aligned:
@@ -1674,12 +1692,13 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v31
 ; GFX11-GISEL-NEXT: s_mov_b32 s4, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 0x7f
+; GFX11-GISEL-NEXT: s_mov_b32 s5, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
-; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: v_lshl_add_u32 v0, v0, 2, 15
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: s_and_b32 s33, s33, 0xffffff80
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x100
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-GISEL-NEXT: v_and_b32_e32 v0, -16, v0
 ; GFX11-GISEL-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
 ; GFX11-GISEL-NEXT: s_ctz_i32_b32 s2, s1
@@ -1699,7 +1718,8 @@ define void @test_dynamic_stackalloc_device_divergent_over_aligned() {
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff00
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s5
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %idx = call i32 @llvm.amdgcn.workitem.id.x()
 %alloca = alloca i32, i32 %idx, align 128, addrspace(5)
@@ -1734,7 +1754,7 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1764,7 +1784,7 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1793,12 +1813,11 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_under_aligned:
@@ -1826,11 +1845,12 @@ define void @test_dynamic_stackalloc_device_divergent_under_aligned() {
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %idx = call i32 @llvm.amdgcn.workitem.id.x()
 %alloca = alloca i32, i32 %idx, align 2, addrspace(5)
@@ -1844,9 +1864,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT: s_mov_b32 s13, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT: s_mov_b32 s14, s34
 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
 ; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x3000
 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB14_6
@@ -1915,7 +1937,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT: buffer_store_dword v1, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xd000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s14
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s13
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -1924,9 +1947,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT: s_mov_b32 s13, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT: s_mov_b32 s14, s34
 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x3000
 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT: s_cbranch_execz .LBB14_6
@@ -1994,7 +2019,8 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xd000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s14
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s13
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2003,9 +2029,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: s_mov_b32 s7, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s8, s34
 ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo
 ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xc0
 ; GFX11-SDAG-NEXT: v_cmpx_eq_u32_e32 0, v0
 ; GFX11-SDAG-NEXT: s_cbranch_execz .LBB14_6
@@ -2079,9 +2107,9 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v1
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s8
 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s7
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff40
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_multiple_allocas:
@@ -2089,9 +2117,11 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT: s_mov_b32 s7, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s8, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, exec_lo
 ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xc0
 ; GFX11-GISEL-NEXT: v_cmpx_eq_u32_e32 0, v0
 ; GFX11-GISEL-NEXT: s_cbranch_execz .LBB14_6
@@ -2162,8 +2192,9 @@ define void @test_dynamic_stackalloc_device_multiple_allocas(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s8
 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s7
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff40
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 entry:
 %cond = icmp eq i32 %n, 0
@@ -2189,9 +2220,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-SDAG-NEXT: s_mov_b32 s11, s33
 ; GFX9-SDAG-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-SDAG-NEXT: s_mov_b32 s12, s34
 ; GFX9-SDAG-NEXT: s_mov_b32 s8, 0
 ; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-SDAG-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX9-SDAG-NEXT: s_addk_i32 s32, 0x2000
 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2243,7 +2276,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT: .LBB15_8: ; %bb.2
 ; GFX9-SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xe000
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX9-SDAG-NEXT: s_mov_b32 s34, s12
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s11
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2253,9 +2287,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT: s_mov_b32 s11, s33
 ; GFX9-GISEL-NEXT: s_add_i32 s33, s32, 0xfc0
+; GFX9-GISEL-NEXT: s_mov_b32 s12, s34
 ; GFX9-GISEL-NEXT: s_mov_b32 s8, 0
 ; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-GISEL-NEXT: s_and_b32 s33, s33, 0xfffff000
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX9-GISEL-NEXT: s_addk_i32 s32, 0x2000
 ; GFX9-GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
@@ -2307,7 +2343,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT: .LBB15_8: ; %bb.2
 ; GFX9-GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xe000
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX9-GISEL-NEXT: s_mov_b32 s34, s12
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s11
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -2317,9 +2354,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-SDAG-NEXT: s_mov_b32 s5, s33
 ; GFX11-SDAG-NEXT: s_add_i32 s33, s32, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s6, s34
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, 0
 ; GFX11-SDAG-NEXT: s_mov_b32 s0, exec_lo
 ; GFX11-SDAG-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s32
 ; GFX11-SDAG-NEXT: s_addk_i32 s32, 0x80
 ; GFX11-SDAG-NEXT: v_cmpx_ne_u32_e32 0, v0
 ; GFX11-SDAG-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2376,8 +2415,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
 ; GFX11-SDAG-NEXT: .LBB15_8: ; %bb.2
 ; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_addk_i32 s32, 0xff80
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s34
+; GFX11-SDAG-NEXT: s_mov_b32 s34, s6
 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s5
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2386,9 +2425,11 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-GISEL-NEXT: s_mov_b32 s5, s33
 ; GFX11-GISEL-NEXT: s_add_i32 s33, s32, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s6, s34
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, 0
 ; GFX11-GISEL-NEXT: s_mov_b32 s0, exec_lo
 ; GFX11-GISEL-NEXT: s_and_not1_b32 s33, s33, 63
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s32
 ; GFX11-GISEL-NEXT: s_addk_i32 s32, 0x80
 ; GFX11-GISEL-NEXT: v_cmpx_ne_u32_e32 0, v0
 ; GFX11-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
@@ -2446,7 +2487,8 @@ define void @test_dynamic_stackalloc_device_control_flow(i32 %n, i32 %m) {
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-GISEL-NEXT: .LBB15_8: ; %bb.2
 ; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-GISEL-NEXT: s_addk_i32 s32, 0xff80
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s34
+; GFX11-GISEL-NEXT: s_mov_b32 s34, s6
 ; GFX11-GISEL-NEXT: s_mov_b32 s33, s5
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 entry:
@@ -2492,7 +2534,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2522,7 +2564,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2551,12 +2593,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16:
@@ -2584,11 +2625,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i16 %n, align 2, addrspace(5)
 store volatile i32 666, ptr addrspace(5) %alloca
@@ -2621,7 +2663,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX9-SDAG-NEXT: buffer_store_dword v0, off, s[0:3], s4
 ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-SDAG-NEXT: s_mov_b32 s32, s33
 ; GFX9-SDAG-NEXT: s_mov_b32 s33, s9
 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2650,7 +2692,7 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s4
 ; GFX9-GISEL-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: s_addk_i32 s32, 0xfc00
+; GFX9-GISEL-NEXT: s_mov_b32 s32, s33
 ; GFX9-GISEL-NEXT: s_mov_b32 s33, s9
 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
 ;
@@ -2677,12 +2719,11 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX11-SDAG-NEXT: s_mov_b32 s1, s32
 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x29a
 ; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, s1
-; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: scratch_store_b32 off, v1, s1 dlc
 ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
 ; GFX11-SDAG-NEXT: v_readfirstlane_b32 s32, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-SDAG-NEXT: s_add_i32 s32, s32, -16
+; GFX11-SDAG-NEXT: s_mov_b32 s32, s33
+; GFX11-SDAG-NEXT: s_mov_b32 s33, s4
 ; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i64:
@@ -2708,11 +2749,12 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i64(i64
 ; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x29a
 ; GFX11-GISEL-NEXT: s_mov_b32 s1, s32
 ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 5
-; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT: s_add_u32 s32, s1, s0
 ; GFX11-GISEL-NEXT: scratch_store_b32 off, v0, s1 dlc
 ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: s_add_i32 s32, s32, -16
+; GFX11-GISEL-NEXT: s_mov_b32 s32, s33
+; GFX11-GISEL-NEXT: s_mov_b32 s33, s4
 ; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
 %alloca = alloca i32, i64 %n, align 2, addrspace(5)
 store volatile i32 666, ptr addrspace(5) %alloca
diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
index 831e246426ba7..b5a9f02711016 100644
--- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
+++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir
@@ -2052,112 +2052,136 @@ machineFunctionInfo:
 body: |
 bb.0:
 ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc
-; GFX7: liveins: $sgpr4
+; GFX7: liveins: $sgpr4, $sgpr5
 ; GFX7-NEXT: {{ $}}
 ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33
 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc
 ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc
+; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34
+; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32
 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc
 ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec
 ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288
 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec
 ; GFX7-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec
-;
GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy 
COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX12-LABEL: name: 
v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, dead renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0, renamable dead $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2180,115 +2204,139 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX900-LABEL: 
name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX940-LABEL: name: 
v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0, implicit $sgpr8_sgpr9 ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_non_vcc_live - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit 
$sgpr8_sgpr9 renamable $vgpr0, renamable $sgpr8_sgpr9 = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2311,112 +2359,136 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; 
GFX900-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, 
$sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0 ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, dead renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0, renamable dead $vcc = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec @@ -2439,115 +2511,139 @@ machineFunctionInfo: body: | bb.0: ; GFX7-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX7: liveins: $sgpr4 + ; GFX7: liveins: $sgpr4, $sgpr5 ; GFX7-NEXT: {{ $}} ; GFX7-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX7-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX7-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX7-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX7-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX7-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX7-NEXT: $sgpr6 = S_MOV_B32 12288 ; GFX7-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr6, killed $vgpr1, 0, implicit $exec ; GFX7-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX7-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX7-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX7-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX7-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX7-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX8-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX8: liveins: $sgpr4 + ; GFX8: liveins: $sgpr4, $sgpr5 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX8-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX8-NEXT: $sgpr33 = frame-setup 
S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX8-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $sgpr6 = S_MOV_B32 12288 ; GFX8-NEXT: $vgpr1, dead $sgpr6_sgpr7 = V_ADD_CO_U32_e64 killed $sgpr6, killed $vgpr1, 0, implicit $exec ; GFX8-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX8-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX8-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX8-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX900-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX900: liveins: $sgpr4 + ; GFX900: liveins: $sgpr4, $sgpr5 ; GFX900-NEXT: {{ $}} ; GFX900-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX900-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX900-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX900-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX900-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX900-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX900-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX900-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX900-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX900-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX900-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX900-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX900-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX90A-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX90A: liveins: $sgpr4 + ; GFX90A: liveins: $sgpr4, $sgpr5 ; GFX90A-NEXT: {{ $}} ; GFX90A-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX90A-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX90A-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX90A-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX90A-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX90A-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; GFX90A-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; GFX90A-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX90A-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX90A-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX90A-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX90A-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX10-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX10: liveins: $sgpr4 + ; GFX10: liveins: $sgpr4, $sgpr5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX10-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; GFX10-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; GFX10-NEXT: $sgpr5 = frame-setup 
COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX10-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX10-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX10-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX10-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; GFX10-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX10-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX10-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX940-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX940: liveins: $sgpr4 + ; GFX940: liveins: $sgpr4, $sgpr5 ; GFX940-NEXT: {{ $}} ; GFX940-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX940-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX940-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX940-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX940-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; GFX940-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; GFX940-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX940-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; GFX940-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; GFX940-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX940-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX940-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX940-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX11-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX11: liveins: $sgpr4 + ; GFX11: liveins: $sgpr4, $sgpr5 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX11-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX11-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX11-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX11-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX11-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 12352, killed $vgpr1, 0, implicit $exec ; GFX11-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX11-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; GFX11-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX11-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX11-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX11-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc ; ; GFX12-LABEL: name: v_add_co_u32_e64_imm_fi_vop3_literal_error_vcc_live - ; GFX12: liveins: $sgpr4 + ; GFX12: liveins: $sgpr4, $sgpr5 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; GFX12-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; GFX12-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; GFX12-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; 
GFX12-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX12-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24576, implicit-def dead $scc ; GFX12-NEXT: $vgpr1 = V_MOV_B32_e32 $sgpr33, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 4160, killed $vgpr1, 0, implicit $exec ; GFX12-NEXT: renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 killed $vgpr0, 0, 0, implicit $exec - ; GFX12-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24576, implicit-def dead $scc + ; GFX12-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; GFX12-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; GFX12-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; GFX12-NEXT: SI_RETURN implicit $vgpr0, implicit $vcc renamable $vgpr0, renamable $vcc = V_ADD_CO_U32_e64 64, %stack.1, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir index de198941b565e..b7a5cf963138f 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-u32.mir @@ -1708,42 +1708,51 @@ machineFunctionInfo: body: | bb.0: ; MUBUF-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; MUBUF: liveins: $sgpr4 + ; MUBUF: liveins: $sgpr4, $sgpr5 ; MUBUF-NEXT: {{ $}} ; MUBUF-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; MUBUF-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; MUBUF-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: $vgpr1 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr1 = V_ADD_U32_e32 12288, killed $vgpr1, implicit $exec ; MUBUF-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $vgpr1, 0, implicit $exec - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; MUBUF-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; MUBUF-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; MUBUF-NEXT: SI_RETURN implicit $vgpr0 ; ; MUBUFW32-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; MUBUFW32: liveins: $sgpr4 + ; MUBUFW32: liveins: $sgpr4, $sgpr5 ; MUBUFW32-NEXT: {{ $}} ; MUBUFW32-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; MUBUFW32-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 262112, implicit-def $scc ; MUBUFW32-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294705152, implicit-def dead $scc + ; MUBUFW32-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; MUBUFW32-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUFW32-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 1048576, implicit-def dead $scc ; MUBUFW32-NEXT: renamable $vgpr1 = V_LSHRREV_B32_e64 5, $sgpr33, implicit $exec ; MUBUFW32-NEXT: renamable $vgpr0 = V_ADD_U32_e64 12352, killed $vgpr1, 0, implicit $exec - ; MUBUFW32-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -1048576, implicit-def dead $scc + ; MUBUFW32-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; MUBUFW32-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; MUBUFW32-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; MUBUFW32-NEXT: SI_RETURN implicit $vgpr0 ; ; FLATSCRW64-LABEL: name: v_add_u32_e64_imm_fi_vop3_literal_error - ; FLATSCRW64: liveins: $sgpr4 + ; FLATSCRW64: liveins: $sgpr4, $sgpr5 ; FLATSCRW64-NEXT: {{ $}} ; FLATSCRW64-NEXT: $sgpr4 = frame-setup COPY $sgpr33 ; FLATSCRW64-NEXT: $sgpr33 = frame-setup 
S_ADD_I32 $sgpr32, 8191, implicit-def $scc ; FLATSCRW64-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294959104, implicit-def dead $scc + ; FLATSCRW64-NEXT: $sgpr5 = frame-setup COPY $sgpr34 + ; FLATSCRW64-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCRW64-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc - ; FLATSCRW64-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc - ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $sgpr5, 0, implicit $exec - ; FLATSCRW64-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc + ; FLATSCRW64-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 12288, implicit-def $scc + ; FLATSCRW64-NEXT: renamable $vgpr0 = V_ADD_U32_e64 64, killed $sgpr6, 0, implicit $exec + ; FLATSCRW64-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; FLATSCRW64-NEXT: $sgpr34 = frame-destroy COPY $sgpr5 ; FLATSCRW64-NEXT: $sgpr33 = frame-destroy COPY $sgpr4 ; FLATSCRW64-NEXT: SI_RETURN implicit $vgpr0 renamable $vgpr0 = V_ADD_U32_e64 64, %stack.1, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll index 8e0750195b3b4..c4063aecb6ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -18,8 +18,10 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: s_addk_i32 s32, 0x3000 ; GCN-NEXT: v_writelane_b32 v42, s16, 2 +; GCN-NEXT: v_writelane_b32 v42, s34, 3 +; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_addk_i32 s32, 0x3000 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 @@ -55,11 +57,12 @@ define void @test_stack_realign(<8 x i32> %val, i32 %idx) #0 { ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: v_readlane_b32 s31, v42, 1 ; GCN-NEXT: v_readlane_b32 s30, v42, 0 +; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 s4, v42, 2 +; GCN-NEXT: v_readlane_b32 s34, v42, 3 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xd000 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll index 3922b5404d778..6684262f5c976 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -28,11 +28,11 @@ define void @callee_with_stack_and_call() #0 { ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 +; SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 ; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] -; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 ; SPILL-TO-VGPR-NEXT: 
s_waitcnt vmcnt(0) ; SPILL-TO-VGPR-NEXT: s_setpc_b64 s[30:31] @@ -87,7 +87,7 @@ define void @callee_with_stack_and_call() #0 { ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[4:5] ; NO-SPILL-TO-VGPR-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; NO-SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xf800 +; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s32, s33 ; NO-SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) ; NO-SPILL-TO-VGPR-NEXT: v_readfirstlane_b32 s4, v0 ; NO-SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll index 6fd2c5a1267fb..32f255df82499 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args-inreg.ll @@ -1680,11 +1680,11 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1710,11 +1710,11 @@ define void @caller_void_func_i32_v2float_inreg(i32 inreg %arg0, <2 x float> inr ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2092,11 +2092,11 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2134,11 +2134,11 @@ define void @void_func_a13i32_inreg([13 x i32] inreg %arg0, ptr addrspace(1) %p ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll index bb0e116cb4d32..1ad365df2e8a8 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -74,10 +74,10 @@ define amdgpu_gfx void @gfx_func() { ; SDAG-NEXT: v_readlane_b32 s6, v40, 2 ; SDAG-NEXT: 
v_readlane_b32 s5, v40, 1 ; SDAG-NEXT: v_readlane_b32 s4, v40, 0 +; SDAG-NEXT: s_mov_b32 s32, s33 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] -; SDAG-NEXT: s_addk_i32 s32, 0xfc00 ; SDAG-NEXT: s_mov_b32 s33, s36 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -151,10 +151,10 @@ define amdgpu_gfx void @gfx_func() { ; GISEL-NEXT: v_readlane_b32 s6, v40, 2 ; GISEL-NEXT: v_readlane_b32 s5, v40, 1 ; GISEL-NEXT: v_readlane_b32 s4, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s36 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 5ccbc85f46dd4..2e3ca34af4c74 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -142,11 +142,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -171,12 +171,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -201,11 +201,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -230,12 +230,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, 
s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -265,11 +265,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -296,12 +296,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -327,11 +327,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -358,12 +358,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -394,11 +394,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -425,12 +425,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -456,11 +456,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -487,12 +487,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -520,11 +520,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -548,12 +548,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -577,11 +577,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -605,12 +605,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -638,11 +638,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -667,12 +667,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -697,11 +697,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -726,12 +726,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -760,11 +760,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -789,12 +789,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -819,11 +819,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -848,12 +848,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -881,11 +881,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -909,12 +909,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -938,11 +938,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -966,12 +966,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -999,11 +999,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1028,12 +1028,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1058,11 +1058,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1087,12 +1087,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_signext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1121,11 +1121,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1150,12 +1150,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1180,11 +1180,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1209,12 +1209,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_zeroext(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1242,11 +1242,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1270,12 +1270,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1299,11 +1299,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1327,12 +1327,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1360,11 +1360,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1389,12 +1389,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1418,11 +1418,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1447,12 +1447,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1481,11 +1481,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1511,12 +1511,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1542,11 +1542,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1572,12 +1572,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1608,11 +1608,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1639,12 +1639,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1669,11 +1669,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1700,12 +1700,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1736,11 +1736,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1768,12 +1768,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1799,11 +1799,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1831,12 +1831,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -1872,11 +1872,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1906,12 +1906,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1938,11 +1938,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1972,12 +1972,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2006,11 +2006,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2034,12 +2034,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2063,11 +2063,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2091,12 +2091,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2123,11 +2123,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2151,12 +2151,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2180,11 +2180,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2208,12 +2208,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2241,11 +2241,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2270,12 +2270,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2299,11 +2299,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2328,12 +2328,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2362,11 +2362,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2392,12 +2392,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2422,11 +2422,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2452,12 +2452,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2488,11 +2488,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2520,12 +2520,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2551,11 +2551,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2583,12 +2583,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2616,11 +2616,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2645,12 +2645,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2674,11 +2674,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2703,12 +2703,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2738,11 +2738,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2769,12 +2769,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2799,11 +2799,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2830,12 +2830,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -2867,11 +2867,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2900,12 +2900,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2931,11 +2931,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2964,12 +2964,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3001,11 +3001,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3034,12 +3034,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3068,11 +3068,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3101,12 +3101,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3139,11 +3139,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3172,12 +3172,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3205,11 +3205,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3238,12 +3238,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3277,11 +3277,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3311,12 +3311,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3345,11 +3345,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3379,12 +3379,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3420,11 +3420,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3456,12 +3456,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3492,11 +3492,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3528,12 +3528,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3572,11 +3572,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3611,12 +3611,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3649,11 +3649,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3688,12 +3688,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -3764,11 +3764,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -3836,12 +3836,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -3902,11 +3902,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -3974,12 +3974,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -4016,11 +4016,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 {
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4052,12 +4052,12 @@ define amdgpu_gfx void
@test_call_external_void_func_i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4089,11 +4089,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4125,12 +4125,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4173,11 +4173,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4209,6 +4209,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[40:41], v0, off @@ -4219,7 +4220,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4253,8 +4253,8 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: 
v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 @@ -4263,7 +4263,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4295,6 +4294,7 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off @@ -4305,7 +4305,6 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4351,11 +4350,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4390,6 +4389,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: global_store_byte v[3:4], v2, off ; GFX10-NEXT: global_store_short v[40:41], v0, off @@ -4400,7 +4400,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4437,6 +4436,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off @@ -4447,7 +4447,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; 
GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4482,6 +4481,7 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: global_store_byte v[3:4], v2, off ; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off @@ -4492,7 +4492,6 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4539,11 +4538,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4577,9 +4576,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 @@ -4589,7 +4589,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4627,9 +4626,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 
v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -4641,7 +4641,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4675,9 +4674,10 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -4687,7 +4687,6 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4739,11 +4738,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -4779,11 +4778,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_byte v[0:1], v4, off ; GFX10-NEXT: global_store_dword v[40:41], v2, off @@ -4794,7 +4794,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] @@ -4834,9 +4833,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX11-NEXT: v_dual_mov_b32 v0, 4 :: v_dual_lshlrev_b32 v3, 16, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 @@ -4851,7 +4851,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4887,11 +4886,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v4, off ; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v2, off @@ -4902,7 +4902,6 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -4959,11 +4958,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v42, 1 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5008,8 +5007,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-NEXT: v_readlane_b32 s34, v42, 2 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v5 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-NEXT: s_clause 0x1 @@ -5019,7 +5019,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5072,8 +5071,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: v_readlane_b32 s0, v42, 2 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX11-NEXT: global_store_b64 v[40:41], v[0:1], off @@ -5083,7 +5083,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5128,8 +5127,9 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-SCRATCH-NEXT: s_clause 0x1 @@ -5139,7 +5139,6 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5263,11 +5262,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5386,12 +5385,12 @@ define amdgpu_gfx void 
@test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5537,11 +5536,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5660,12 +5659,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5696,11 +5695,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5724,12 +5723,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5753,11 +5752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; 
GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5781,12 +5780,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5814,11 +5813,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5842,12 +5841,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5871,11 +5870,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5899,12 +5898,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5932,11 +5931,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, 
v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5960,12 +5959,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5989,11 +5988,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6017,12 +6016,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6051,11 +6050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6080,12 +6079,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6109,11 
+6108,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6138,12 +6137,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6171,11 +6170,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6200,12 +6199,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6230,11 +6229,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6259,12 +6258,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; 
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6291,11 +6290,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6319,12 +6318,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6348,11 +6347,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6376,12 +6375,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6410,11 +6409,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6439,12 +6438,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, 
v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6469,11 +6468,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6498,12 +6497,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6530,11 +6529,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6558,12 +6557,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6587,11 +6586,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6615,12 +6614,12 @@ define amdgpu_gfx void 
@test_call_external_void_func_v2f16() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6648,11 +6647,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6676,12 +6675,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6705,11 +6704,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6733,12 +6732,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6767,11 +6766,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 
4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6796,12 +6795,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6825,11 +6824,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6854,12 +6853,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6888,11 +6887,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6918,12 +6917,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6948,11 +6947,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 
s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -6978,12 +6977,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7013,11 +7012,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7044,12 +7043,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7074,11 +7073,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7105,12 +7104,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7137,11 +7136,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7165,12 +7164,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7194,11 +7193,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7222,12 +7221,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7258,11 +7257,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7289,12 +7288,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7319,11 +7318,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7350,12 +7349,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7386,11 +7385,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7418,12 +7417,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7449,11 +7448,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7481,12 +7480,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7517,11 +7516,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7550,12 +7549,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7584,11 +7583,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7617,12 +7616,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7658,11 +7657,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7693,12 +7692,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7725,11 +7724,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7760,12 +7759,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7798,11 +7797,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7833,12 +7832,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -7869,11 +7868,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -7904,12 +7903,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -7949,11 +7948,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -7988,12 +7987,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8028,11 +8027,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8067,12 +8066,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8115,11 +8114,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8157,12 +8156,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8199,11 +8198,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8241,12 +8240,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32(i32) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8284,11 +8283,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v42, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v42, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8321,12 +8320,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4
 ; GFX10-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v42, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8358,11 +8357,11 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4
 ; GFX11-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v42, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v42, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8395,12 +8394,12 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8432,11 +8431,11 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8465,12 +8464,12 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8499,11 +8498,11 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8532,12 +8531,12 @@ define amdgpu_gfx void @test_call_external_void_func_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8570,11 +8569,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8602,12 +8601,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8635,11 +8634,11 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8667,12 +8666,12 @@ define amdgpu_gfx void @test_call_external_void_func_byval_struct_i8_i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8712,6 +8711,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: global_store_byte v[0:1], v0, off
@@ -8721,7 +8721,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xf800
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8754,6 +8753,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_store_byte v[0:1], v0, off
@@ -8764,7 +8764,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -8795,6 +8794,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 offset:12
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc
@@ -8804,7 +8804,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8837,6 +8836,7 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off
@@ -8847,7 +8847,6 @@ define amdgpu_gfx void @test_call_external_void_func_sret_struct_i8_i32_byval_st
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -8909,11 +8908,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -8960,12 +8959,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9008,11 +9007,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9059,12 +9058,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i8() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9158,10 +9157,10 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: s_addk_i32 s32, 0xf800
 ; GFX9-NEXT: s_mov_b32 s33, s6
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9251,11 +9250,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s4
-; GFX10-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX10-NEXT: s_mov_b32 s33, s6
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9341,10 +9340,10 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX11-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s0
-; GFX11-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX11-NEXT: s_mov_b32 s33, s4
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9431,11 +9430,11 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:24 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0
-; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9466,11 +9465,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9495,12 +9494,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9525,11 +9524,11 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9554,12 +9553,12 @@ define amdgpu_gfx void @test_call_external_void_func_i1_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9588,11 +9587,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9618,12 +9617,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9649,11 +9648,11 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9679,12 +9678,12 @@ define amdgpu_gfx void @test_call_external_void_func_i8_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9713,11 +9712,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9743,12 +9742,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9774,11 +9773,11 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9804,12 +9803,12 @@ define amdgpu_gfx void @test_call_external_void_func_i16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9838,11 +9837,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9868,12 +9867,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -9899,11 +9898,11 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -9929,12 +9928,12 @@ define amdgpu_gfx void @test_call_external_void_func_i32_imm_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -9966,11 +9965,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -9999,12 +9998,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10033,11 +10032,11 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10066,12 +10065,12 @@ define amdgpu_gfx void @test_call_external_void_func_i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10107,11 +10106,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10144,12 +10143,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10182,11 +10181,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10219,12 +10218,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10263,11 +10262,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10302,12 +10301,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10342,11 +10341,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10381,12 +10380,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i64_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10428,11 +10427,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 8
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10471,12 +10470,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 8
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10515,11 +10514,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 8
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10558,12 +10557,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10614,11 +10613,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10663,12 +10662,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10713,11 +10712,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10762,12 +10761,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i64_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10798,11 +10797,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10828,12 +10827,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10859,11 +10858,11 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -10889,12 +10888,12 @@ define amdgpu_gfx void @test_call_external_void_func_f16_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -10923,11 +10922,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -10953,12 +10952,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -10984,11 +10983,11 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11014,12 +11013,12 @@ define amdgpu_gfx void @test_call_external_void_func_f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -11051,11 +11050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11084,12 +11083,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -11118,11 +11117,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11151,12 +11150,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -11191,11 +11190,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 5
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11227,12 +11226,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 5
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -11264,11 +11263,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 5
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11300,12 +11299,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -11346,11 +11345,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 7
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -11388,12 +11387,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 7
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -11431,11 +11430,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 7
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -11473,12 +11472,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5f32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ;
GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11510,11 +11509,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11543,12 +11542,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11577,11 +11576,11 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11610,12 +11609,12 @@ define amdgpu_gfx void @test_call_external_void_func_f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11653,11 +11652,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11692,12 +11691,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: 
v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11732,11 +11731,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11771,12 +11770,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11820,11 +11819,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 8 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -11865,12 +11864,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 8 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -11911,11 +11910,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 8 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -11956,12 +11955,12 @@ 
define amdgpu_gfx void @test_call_external_void_func_v3f64_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -11990,11 +11989,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12020,12 +12019,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12051,11 +12050,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12081,12 +12080,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12118,11 +12117,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: 
buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12150,12 +12149,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12183,11 +12182,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12215,12 +12214,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12252,11 +12251,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12284,12 +12283,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12317,11 +12316,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: 
v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12349,12 +12348,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12387,11 +12386,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12420,12 +12419,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12454,11 +12453,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12487,12 +12486,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; 
GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12524,11 +12523,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12557,12 +12556,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12591,11 +12590,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12624,12 +12623,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3f16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12660,11 +12659,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12692,12 +12691,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, 
off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12725,11 +12724,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12757,12 +12756,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12795,11 +12794,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12828,12 +12827,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12862,11 +12861,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -12895,12 +12894,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i16_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -12929,11 +12928,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -12959,12 +12958,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -12990,11 +12989,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13020,12 +13019,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2f16_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13057,11 +13056,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; 
GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13089,12 +13088,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13122,11 +13121,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13154,12 +13153,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13192,11 +13191,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13225,12 +13224,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13259,11 +13258,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: 
scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13292,12 +13291,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13332,11 +13331,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13368,12 +13367,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 5 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13405,11 +13404,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 5 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13441,12 +13440,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_imm_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13484,11 +13483,11 @@ define amdgpu_gfx void 
@test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13523,12 +13522,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13563,11 +13562,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13602,12 +13601,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3i32_i32_inreg(i32) #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13642,11 +13641,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13678,12 +13677,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; 
GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13715,11 +13714,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13751,12 +13750,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13795,11 +13794,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 6 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13834,12 +13833,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 6 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13874,11 +13873,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 6 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13913,12 +13912,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 
; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13959,11 +13958,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 7 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14001,12 +14000,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v40, 7 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -14044,11 +14043,11 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v40, 7 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -14086,12 +14085,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i32_imm_inreg() #0 { ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -14136,11 +14135,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 { ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -14182,12 +14181,12 @@ define amdgpu_gfx void 
@test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -14229,11 +14228,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -14275,12 +14274,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -14332,11 +14331,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -14383,12 +14382,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 10
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -14435,11 +14434,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -14486,12 +14485,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8i32_imm_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -14552,11 +14551,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 18
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -14614,12 +14613,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 18
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -14677,11 +14676,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 18
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -14739,12 +14738,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -14851,11 +14850,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -14958,12 +14957,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15061,11 +15060,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15165,12 +15164,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_inreg() #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -15282,11 +15281,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX9-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -15394,12 +15393,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 28
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15501,11 +15500,11 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX11-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15611,12 +15610,12 @@ define amdgpu_gfx void @test_call_external_void_func_v32i32_i32_inreg(i32) #0 {
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -15651,11 +15650,11 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -15685,12 +15684,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15715,11 +15714,11 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15745,12 +15744,12 @@ define amdgpu_gfx void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, d
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -15817,11 +15816,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -15884,12 +15883,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -15931,11 +15930,11 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -15995,12 +15994,12 @@ define amdgpu_gfx void @stack_12xv3i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16087,11 +16086,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16162,12 +16161,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16213,11 +16212,11 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16283,12 +16282,12 @@ define amdgpu_gfx void @stack_8xv5i32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16371,11 +16370,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16446,12 +16445,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16503,11 +16502,11 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16573,12 +16572,12 @@ define amdgpu_gfx void @stack_8xv5f32() #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16613,11 +16612,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16640,12 +16639,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16668,11 +16667,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16695,12 +16694,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16727,11 +16726,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16754,12 +16753,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16782,11 +16781,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16809,12 +16808,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16(i16 %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16841,11 +16840,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16868,12 +16867,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -16896,11 +16895,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -16923,12 +16922,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16(i32 %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -16955,11 +16954,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -16982,12 +16981,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17010,11 +17009,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17037,12 +17036,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16(<3 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17069,11 +17068,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17096,12 +17095,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17124,11 +17123,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17151,12 +17150,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16(<4 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17183,11 +17182,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17210,12 +17209,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17238,11 +17237,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17265,12 +17264,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16(<8 x i16> %arg) #0 {
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17297,11 +17296,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17324,12 +17323,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17352,11 +17351,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17379,12 +17378,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16(<16 x i16> %arg) #0
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17411,11 +17410,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17438,12 +17437,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17466,11 +17465,11 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17493,12 +17492,12 @@ define amdgpu_gfx void @test_call_external_void_func_bf16_inreg(i16 inreg %arg)
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17525,11 +17524,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17552,12 +17551,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17580,11 +17579,11 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17607,12 +17606,12 @@ define amdgpu_gfx void @test_call_external_void_func_v1bf16_inreg(i16 inreg %arg
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17639,11 +17638,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17666,12 +17665,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17694,11 +17693,11 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17721,12 +17720,12 @@ define amdgpu_gfx void @test_call_external_void_func_v2bf16_inreg(i32 inreg %arg
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17753,11 +17752,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17780,12 +17779,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17808,11 +17807,11 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17835,12 +17834,12 @@ define amdgpu_gfx void @test_call_external_void_func_v3bf16_inreg(<3 x i16> inre
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17867,11 +17866,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -17894,12 +17893,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -17922,11 +17921,11 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -17949,12 +17948,12 @@ define amdgpu_gfx void @test_call_external_void_func_v4bf16_inreg(<4 x i16> inre
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -17981,11 +17980,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -18008,12 +18007,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -18036,11 +18035,11 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -18063,12 +18062,12 @@ define amdgpu_gfx void @test_call_external_void_func_v8bf16_inreg(<8 x i16> inre
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
@@ -18095,11 +18094,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -18122,12 +18121,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -18150,11 +18149,11 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -18177,12 +18176,12 @@ define amdgpu_gfx void @test_call_external_void_func_v16bf16_inreg(<16 x i16> in
 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-SCRATCH-NEXT: s_mov_b32 s32, s33
 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1
-; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16
 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0
 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
index a14e3d5673f82..4afc2fc972a28 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll
@@ -30,11 +30,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -64,12 +64,12 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 4
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -99,11 +99,11 @@ define amdgpu_gfx void @test_call_external_void_func_void_clobber_s30_s31_call_e
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 4
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -227,11 +227,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -264,12 +264,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -301,11 +301,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -343,11 +343,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -380,12 +380,12 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -418,11 +418,11 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1)
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -461,11 +461,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -498,12 +498,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -536,11 +536,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s33(ptr addrspace(1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -578,11 +578,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -615,12 +615,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -653,11 +653,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_s34(ptr addrspace(1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -693,11 +693,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -728,12 +728,12 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v41, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -763,11 +763,11 @@ define amdgpu_gfx void @test_call_void_func_void_preserves_v40(ptr addrspace(1)
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: v_readlane_b32 s31, v41, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v41, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -911,11 +911,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -938,12 +938,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -966,11 +966,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s33() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -996,11 +996,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1023,12 +1023,12 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35]
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 2
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1051,11 +1051,11 @@ define amdgpu_gfx void @test_call_void_func_void_clobber_s34() #0 {
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 2
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1090,11 +1090,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX9-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX9-NEXT: s_mov_b64 exec, s[36:37]
-; GFX9-NEXT: s_addk_i32 s32, 0xfc00
 ; GFX9-NEXT: s_mov_b32 s33, s34
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1126,12 +1126,12 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX10-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0
+; GFX10-NEXT: s_mov_b32 s32, s33
 ; GFX10-NEXT: v_readlane_b32 s34, v40, 3
 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1
 ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_mov_b32 exec_lo, s35
-; GFX10-NEXT: s_addk_i32 s32, 0xfe00
 ; GFX10-NEXT: s_mov_b32 s33, s34
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1163,11 +1163,11 @@ define amdgpu_gfx void @callee_saved_sgpr_kernel() #1 {
 ; GFX11-NEXT: v_readlane_b32 s31, v40, 2
 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1
 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
 ; GFX11-NEXT: v_readlane_b32 s0, v40, 3
 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1
 ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_add_i32 s32, s32, -16
 ; GFX11-NEXT: s_mov_b32 s33, s0
 ; GFX11-NEXT: s_waitcnt vmcnt(0)
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -1213,11 +1213,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 {
 ; GFX9-NEXT: v_readlane_b32 s31, v41, 2
 ; GFX9-NEXT: v_readlane_b32 s30, v41, 1
 ; GFX9-NEXT: v_readlane_b32 s4, v41, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s34, v41, 3
 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1
 ; GFX9-NEXT:
buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1258,12 +1258,12 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: v_readlane_b32 s31, v41, 2 ; GFX10-NEXT: v_readlane_b32 s30, v41, 1 ; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s34, v41, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1303,11 +1303,11 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: v_readlane_b32 s31, v41, 2 ; GFX11-NEXT: v_readlane_b32 s30, v41, 1 ; GFX11-NEXT: v_readlane_b32 s4, v41, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v41, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index c3ab9c23d1950..6384fdba7a45a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -36,10 +36,10 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -61,11 +61,11 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -87,10 +87,10 @@ define amdgpu_gfx void @call_i1() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -132,10 +132,10 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; 
GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -157,11 +157,11 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -183,10 +183,10 @@ define amdgpu_gfx void @call_i16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -228,10 +228,10 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v1, 1 ; GFX9-NEXT: v_readlane_b32 s30, v1, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -253,11 +253,11 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v1, 1 ; GFX10-NEXT: v_readlane_b32 s30, v1, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -279,10 +279,10 @@ define amdgpu_gfx void @call_2xi16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v1, 1 ; GFX11-NEXT: v_readlane_b32 s30, v1, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -332,10 +332,10 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -357,11 +357,11 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: 
v_readlane_b32 s30, v2, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -383,10 +383,10 @@ define amdgpu_gfx void @call_3xi16() #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v2, 1 ; GFX11-NEXT: v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -746,10 +746,10 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v100, 1 ; GFX9-NEXT: v_readlane_b32 s30, v100, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_addk_i32 s32, 0xdc00 ; GFX9-NEXT: s_mov_b32 s33, s36 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -836,11 +836,11 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:124 ; GFX10-NEXT: v_readlane_b32 s31, v100, 1 ; GFX10-NEXT: v_readlane_b32 s30, v100, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 ; GFX10-NEXT: buffer_load_dword v100, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_addk_i32 s32, 0xee00 ; GFX10-NEXT: s_mov_b32 s33, s36 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -927,10 +927,10 @@ define amdgpu_gfx void @call_100xi32() #0 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:124 ; GFX11-NEXT: v_readlane_b32 s31, v100, 1 ; GFX11-NEXT: v_readlane_b32 s30, v100, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xff70 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -2130,61 +2130,67 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX9-LABEL: call_512xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_mov_b32 s35, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x1ffc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xfffe0000 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-NEXT: s_mov_b32 s35, return_512xi32@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, return_512xi32@abs32@lo +; GFX9-NEXT: s_mov_b32 s37, return_512xi32@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, return_512xi32@abs32@lo ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: s_mov_b32 s38, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: 
s_add_i32 s32, s32, 0x60000 ; GFX9-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: v_readlane_b32 s31, v2, 1 ; GFX9-NEXT: v_readlane_b32 s30, v2, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s38 +; GFX9-NEXT: s_xor_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_add_i32 s32, s32, 0xfffa0000 -; GFX9-NEXT: s_mov_b32 s33, s36 +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s33, s35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: call_512xi32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s35, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0xffe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffff0000 -; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 ; GFX10-NEXT: v_writelane_b32 v2, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: s_mov_b32 s35, return_512xi32@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, return_512xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s37, return_512xi32@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, return_512xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s38, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x30000 ; GFX10-NEXT: v_writelane_b32 v2, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: v_readlane_b32 s31, v2, 1 ; GFX10-NEXT: v_readlane_b32 s30, v2, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s34, -1 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s38 +; GFX10-NEXT: s_xor_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:2048 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_add_i32 s32, s32, 0xfffd0000 -; GFX10-NEXT: s_mov_b32 s33, s36 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 +; GFX10-NEXT: s_mov_b32 s33, s35 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: call_512xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s35, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x7ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffff800 @@ -2195,17 +2201,20 @@ define amdgpu_gfx void @call_512xi32() #0 { ; GFX11-NEXT: v_mov_b32_e32 v0, s33 ; GFX11-NEXT: s_mov_b32 s1, return_512xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, return_512xi32@abs32@lo +; GFX11-NEXT: s_mov_b32 s36, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0x1800 ; GFX11-NEXT: v_writelane_b32 v5, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v5, 1 ; GFX11-NEXT: v_readlane_b32 s30, v5, 0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v5, off, s33 offset:2048 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xe800 -; 
GFX11-NEXT: s_mov_b32 s33, s34 +; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -2619,12 +2628,14 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-LABEL: call_72xi32: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s36, s33 +; GFX9-NEXT: s_mov_b32 s35, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[34:35] +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s38, s34 +; GFX9-NEXT: s_mov_b32 s34, s32 ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill @@ -2685,8 +2696,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_writelane_b32 v63, s30, 0 -; GFX9-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi -; GFX9-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo +; GFX9-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi +; GFX9-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo ; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 @@ -2720,7 +2731,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 ; GFX9-NEXT: v_writelane_b32 v63, s31, 1 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 @@ -2859,7 +2870,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload @@ -2877,25 +2888,28 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v63, 1 ; GFX9-NEXT: v_readlane_b32 s30, v63, 0 -; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: s_mov_b32 s32, s34 +; GFX9-NEXT: s_mov_b32 s34, s38 +; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 -; GFX9-NEXT: s_mov_b32 s33, s36 +; GFX9-NEXT: s_mov_b64 exec, s[36:37] +; GFX9-NEXT: s_mov_b32 s33, s35 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: call_72xi32: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s36, s33 +; GFX10-NEXT: s_mov_b32 s35, s33 ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: s_or_saveexec_b32 s36, -1 ; GFX10-NEXT: 
buffer_store_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_mov_b32 s38, s34 +; GFX10-NEXT: s_mov_b32 s34, s32 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill @@ -2912,8 +2926,6 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v63, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2956,11 +2968,13 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x200, v0 ; GFX10-NEXT: v_mov_b32_e32 v6, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 @@ -2987,10 +3001,10 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v29, 0 ; GFX10-NEXT: v_mov_b32_e32 v30, 0 ; GFX10-NEXT: v_mov_b32_e32 v31, 0 -; GFX10-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi -; GFX10-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo +; GFX10-NEXT: s_mov_b32 s37, return_72xi32@abs32@hi +; GFX10-NEXT: s_mov_b32 s36, return_72xi32@abs32@lo ; GFX10-NEXT: v_writelane_b32 v63, s31, 1 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0x28 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 @@ -3133,7 +3147,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 -; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] +; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_clause 0xe ; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 @@ -3152,19 +3166,20 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; GFX10-NEXT: v_readlane_b32 s31, v63, 1 ; GFX10-NEXT: v_readlane_b32 s30, v63, 0 -; GFX10-NEXT: s_or_saveexec_b32 s34, -1 +; GFX10-NEXT: s_mov_b32 s32, s34 +; GFX10-NEXT: s_mov_b32 s34, s38 +; GFX10-NEXT: s_or_saveexec_b32 s36, -1 ; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s34 -; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 -; GFX10-NEXT: s_mov_b32 s33, s36 +; GFX10-NEXT: s_mov_b32 exec_lo, s36 +; GFX10-NEXT: s_mov_b32 s33, s35 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: call_72xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s34, s33 +; GFX11-NEXT: s_mov_b32 s35, s33 ; GFX11-NEXT: s_add_i32 s33, s32, 0x1ff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 @@ -3178,6 +3193,8 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_mov_b32 s3, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX11-NEXT: s_mov_b32 s36, s34 +; GFX11-NEXT: s_mov_b32 s34, s32 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 ; GFX11-NEXT: s_clause 0xb ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 @@ -3347,11 +3364,12 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 ; GFX11-NEXT: v_readlane_b32 s31, v60, 1 ; GFX11-NEXT: v_readlane_b32 s30, v60, 0 +; GFX11-NEXT: s_mov_b32 s32, s34 +; GFX11-NEXT: s_mov_b32 s34, s36 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1600 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xf600 -; GFX11-NEXT: s_mov_b32 s33, s34 +; GFX11-NEXT: s_mov_b32 s33, s35 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll index 334e6e2b617e0..d8df20eb69452 100644 --- a/llvm/test/CodeGen/AMDGPU/global-alias.ll +++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll @@ -37,11 +37,11 @@ define void @bar() { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index da8aa54469835..55da485b91f67 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -193,11 +193,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -277,11 +277,11 @@ define void @test_indirect_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -368,11 +368,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr 
%fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -453,11 +453,11 @@ define void @test_indirect_call_vgpr_ptr_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -543,11 +543,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 18 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -629,11 +629,11 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 18 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -729,11 +729,11 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 20 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -824,11 +824,11 @@ define void @test_indirect_call_vgpr_ptr_in_branch(ptr %fptr, i1 %cond) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: v_readlane_b32 s4, v40, 20 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -930,10 +930,10 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 
s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s5 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1024,10 +1024,10 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1126,10 +1126,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s31, v41, 1 ; GCN-NEXT: v_readlane_b32 s30, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1224,10 +1224,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s31, v41, 1 ; GISEL-NEXT: v_readlane_b32 s30, v41, 0 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1328,10 +1328,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1424,10 +1424,10 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1522,10 +1522,10 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: 
s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1615,10 +1615,10 @@ define void @test_indirect_tail_call_vgpr_ptr(ptr %fptr) { ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 +; GISEL-NEXT: s_mov_b32 s32, s33 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 5c5a769178dd9..ea3d57d127151 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -36,10 +36,10 @@ define void @f0() { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v4, 1 ; GFX11-NEXT: v_readlane_b32 s30, v4, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s2 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index eef51acc4e12e..1f518386c63d5 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -41,11 +41,11 @@ define fastcc i32 @foo() { ; CHECK-NEXT: bb.2.DummyReturnBlock: ; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1 ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0 + ; CHECK-NEXT: $sgpr32 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2 ; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADDK_I32 $sgpr32, -512, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = S_MOV_B32 killed $sgpr4 ; CHECK-NEXT: S_WAITCNT 16240 ; CHECK-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit undef $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll index 29fbb0bb1c6c9..0edc7cb01887b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -112,10 +112,12 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfff80000 ; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 ; MUBUF-NEXT: v_add_u32_e32 v3, 0x3000, v3 +; MUBUF-NEXT: s_mov_b32 s6, s34 ; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x2000 ; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_add_i32 s32, s32, 0x200000 ; MUBUF-NEXT: buffer_store_dword v3, v4, s[0:3], s33 offen ; MUBUF-NEXT: s_waitcnt vmcnt(0) @@ -141,7 +143,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:4 glc ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; 
MUBUF-NEXT: s_add_i32 s32, s32, 0xffe00000 +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s6 ; MUBUF-NEXT: s_mov_b32 s33, s5 ; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v6 ; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v7, vcc @@ -155,6 +158,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_mov_b32 s2, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff ; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000 +; FLATSCR-NEXT: s_mov_b32 s3, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_add_i32 s32, s32, 0x8000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x2000 @@ -179,7 +184,8 @@ define void @func_local_stack_offset_uses_sp(ptr addrspace(1) %out) { ; FLATSCR-NEXT: s_add_i32 s0, s33, 0x3000 ; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64 glc ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_addk_i32 s32, 0x8000 +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s3 ; FLATSCR-NEXT: s_mov_b32 s33, s2 ; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll index b77c3a9bb532b..8e436b327cda1 100644 --- a/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.gfx10.ll @@ -439,6 +439,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use alloca0 v0 @@ -455,7 +456,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_1-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s5 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -473,6 +473,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: v_writelane_b32 v1, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v0, 64, v0 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use alloca0 v0 @@ -488,7 +489,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80880 ; GFX10_3-NEXT: buffer_load_dword v1, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s5 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -513,7 +513,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc @@ -523,7 +523,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX11-NEXT: s_add_i32 s2, s33, 0x4044 ; GFX11-NEXT: scratch_load_b32 v1, off, s2 ; 4-byte Folded Reload ; 
GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -558,11 +557,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: v_readlane_b32 s59, v1, 0 +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v1, off, s33 offset:16388 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -593,11 +592,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v1, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s7, s33, 0x101100 ; GFX8-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -626,11 +625,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v1, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s7, s33, 0x101100 ; GFX900-NEXT: buffer_load_dword v1, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -660,11 +659,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_fp() #1 { ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v1, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s3, s33, 0x4044 ; GFX940-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1027,6 +1026,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: v_add_nc_u32_e32 v1, 64, v1 ; GFX10_1-NEXT: v_readfirstlane_b32 s59, v1 ; GFX10_1-NEXT: ;;#ASMSTART @@ -1038,7 +1038,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s5 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -1056,6 +1055,7 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_and_b32 s4, 0, exec_lo +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: v_add_nc_u32_e32 v1, 64, v1 ; GFX10_3-NEXT: 
v_readfirstlane_b32 s59, v1 ; GFX10_3-NEXT: ;;#ASMSTART @@ -1066,7 +1066,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 ; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s4 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s5 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1084,9 +1083,10 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_and_b32 s0, 0, exec_lo ; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addc_u32 s0, s33, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-NEXT: s_bitset0_b32 s0, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s59, s0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59, scc @@ -1096,7 +1096,6 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 ; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1122,11 +1121,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX12-NEXT: ; use s59, scc ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s1 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1152,11 +1151,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX8-NEXT: ; use s59, scc ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v0, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: s_add_i32 s7, s33, 0x101000 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1180,11 +1179,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX900-NEXT: ; use s59, scc ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v0, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX900-NEXT: s_add_i32 s7, s33, 0x101000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s7 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s6 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1209,11 +1208,11 @@ define void @scalar_mov_materializes_frame_index_unavailable_scc_small_offset_fp ; GFX940-NEXT: ; use s59, scc ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v0, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[0:1], -1 ; GFX940-NEXT: s_add_i32 s3, s33, 0x4040 ; GFX940-NEXT: scratch_load_dword v0, off, s3 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[0:1] -; 
GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1236,6 +1235,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_1-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_1-NEXT: s_lshr_b32 s59, s33, 5 +; GFX10_1-NEXT: s_mov_b32 s32, s33 ; GFX10_1-NEXT: s_add_i32 s59, s59, 64 ; GFX10_1-NEXT: ;;#ASMSTART ; GFX10_1-NEXT: ; use s59 @@ -1246,7 +1246,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_1-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_1-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_1-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_1-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_1-NEXT: s_mov_b32 s33, s4 ; GFX10_1-NEXT: s_waitcnt vmcnt(0) ; GFX10_1-NEXT: s_setpc_b64 s[30:31] @@ -1263,6 +1262,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: v_writelane_b32 v0, s59, 0 ; GFX10_3-NEXT: s_add_i32 s32, s32, 0x81000 ; GFX10_3-NEXT: s_lshr_b32 s59, s33, 5 +; GFX10_3-NEXT: s_mov_b32 s32, s33 ; GFX10_3-NEXT: s_add_i32 s59, s59, 64 ; GFX10_3-NEXT: ;;#ASMSTART ; GFX10_3-NEXT: ; use s59 @@ -1272,7 +1272,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX10_3-NEXT: s_add_i32 s6, s33, 0x80800 ; GFX10_3-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload ; GFX10_3-NEXT: s_mov_b32 exec_lo, s5 -; GFX10_3-NEXT: s_add_i32 s32, s32, 0xfff7f000 ; GFX10_3-NEXT: s_mov_b32 s33, s4 ; GFX10_3-NEXT: s_waitcnt vmcnt(0) ; GFX10_3-NEXT: s_setpc_b64 s[30:31] @@ -1289,7 +1288,7 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: v_writelane_b32 v0, s59, 0 ; GFX11-NEXT: s_addk_i32 s32, 0x4080 ; GFX11-NEXT: s_add_i32 s1, s33, 64 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: s_mov_b32 s59, s1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s59 @@ -1299,7 +1298,6 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX11-NEXT: s_add_i32 s2, s33, 0x4040 ; GFX11-NEXT: scratch_load_b32 v0, off, s2 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xbf80 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1318,18 +1316,17 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 ; GFX12-NEXT: v_writelane_b32 v0, s59, 0 -; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: s_addk_co_i32 s32, 0x4040 +; GFX12-NEXT: s_mov_b32 s59, s33 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ; use s59 ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: s_mov_b32 s32, s33 ; GFX12-NEXT: v_readlane_b32 s59, v0, 0 ; GFX12-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX12-NEXT: scratch_load_b32 v0, off, s33 offset:16384 ; 4-byte Folded Reload ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_mov_b32 exec_lo, s1 -; GFX12-NEXT: s_addk_co_i32 s32, 0xbfc0 ; GFX12-NEXT: s_mov_b32 s33, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_wait_alu 0xfffe @@ -1352,11 +1349,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX8-NEXT: ; use s59 ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: v_readlane_b32 s59, v0, 0 +; GFX8-NEXT: s_mov_b32 s32, s33 ; GFX8-NEXT: s_xor_saveexec_b64 s[6:7], 
-1 ; GFX8-NEXT: s_add_i32 s5, s33, 0x101000 ; GFX8-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[6:7] -; GFX8-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX8-NEXT: s_mov_b32 s33, s4 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1378,11 +1375,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX900-NEXT: ; use s59 ; GFX900-NEXT: ;;#ASMEND ; GFX900-NEXT: v_readlane_b32 s59, v0, 0 +; GFX900-NEXT: s_mov_b32 s32, s33 ; GFX900-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX900-NEXT: s_add_i32 s5, s33, 0x101000 ; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[6:7] -; GFX900-NEXT: s_add_i32 s32, s32, 0xffefe000 ; GFX900-NEXT: s_mov_b32 s33, s4 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -1404,11 +1401,11 @@ define void @scalar_mov_materializes_frame_index_available_scc_small_offset_fp() ; GFX940-NEXT: ; use s59 ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_readlane_b32 s59, v0, 0 +; GFX940-NEXT: s_mov_b32 s32, s33 ; GFX940-NEXT: s_xor_saveexec_b64 s[2:3], -1 ; GFX940-NEXT: s_add_i32 s1, s33, 0x4040 ; GFX940-NEXT: scratch_load_dword v0, off, s1 ; 4-byte Folded Reload ; GFX940-NEXT: s_mov_b64 exec, s[2:3] -; GFX940-NEXT: s_addk_i32 s32, 0xbf80 ; GFX940-NEXT: s_mov_b32 s33, s0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 299bbdac60091..4bc7711f2f839 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -229,11 +229,11 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: v_readlane_b32 s34, v43, 2 ; GFX9-NEXT: v_readlane_b32 s31, v43, 1 ; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v43, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll index 0e750d879ac94..2d853212166e9 100644 --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -12,7 +12,7 @@ define hidden fastcc void @callee_has_fp() #1 { ; CHECK-NEXT: v_mov_b32_e32 v0, 1 ; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffe00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, addrspace(5) @@ -50,10 +50,10 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 { ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -192,10 +192,10 @@ define hidden i32 
@caller_save_vgpr_spill_fp_tail_call() #0 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v1, 1 ; CHECK-NEXT: v_readlane_b32 s30, v1, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s18 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -226,10 +226,10 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v2, 1 ; CHECK-NEXT: v_readlane_b32 s30, v2, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s19 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index 64a94a5ee0e70..593f40fd1b25e 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -24,12 +24,12 @@ declare void @external_void_func_i32(i32) #0 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 +; GCN: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -44,7 +44,6 @@ define void @test_func_call_external_void_func_i32_imm() #0 { ; GCN-DAG: s_addk_i32 s32, 0x1400{{$}} ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; GCN: s_swappc_b64 -; GCN: s_addk_i32 s32, 0xec00{{$}} ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_i32_imm_stack_use() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll index 9999cb9173b5d..25b7b043fc6b6 100644 --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -38,12 +38,11 @@ define hidden void @_ZL3barv() #0 !dbg !1644 { ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 -; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll index 2bd60e869f843..fb14f1844427e 100644 --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -246,7 +246,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; MUBUF-NEXT: v_mov_b32_e32 
v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 +; MUBUF-NEXT: s_mov_b32 s32, s33 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; @@ -280,7 +280,7 @@ define void @func_non_entry_block_static_alloca_align4(ptr addrspace(1) %out, i3 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_add_i32 s32, s32, -16 +; FLATSCR-NEXT: s_mov_b32 s32, s33 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -316,8 +316,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MUBUF-NEXT: s_mov_b32 s7, s33 ; MUBUF-NEXT: s_add_i32 s33, s32, 0xfc0 +; MUBUF-NEXT: s_mov_b32 s8, s34 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; MUBUF-NEXT: s_and_b32 s33, s33, 0xfffff000 +; MUBUF-NEXT: s_mov_b32 s34, s32 ; MUBUF-NEXT: s_addk_i32 s32, 0x2000 ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz .LBB3_2 @@ -341,7 +343,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; MUBUF-NEXT: v_mov_b32_e32 v0, 0 ; MUBUF-NEXT: global_store_dword v[0:1], v0, off ; MUBUF-NEXT: s_waitcnt vmcnt(0) -; MUBUF-NEXT: s_addk_i32 s32, 0xe000 +; MUBUF-NEXT: s_mov_b32 s32, s34 +; MUBUF-NEXT: s_mov_b32 s34, s8 ; MUBUF-NEXT: s_mov_b32 s33, s7 ; MUBUF-NEXT: s_setpc_b64 s[30:31] ; @@ -350,8 +353,10 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: s_mov_b32 s3, s33 ; FLATSCR-NEXT: s_add_i32 s33, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s4, s34 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; FLATSCR-NEXT: s_andn2_b32 s33, s33, 63 +; FLATSCR-NEXT: s_mov_b32 s34, s32 ; FLATSCR-NEXT: s_addk_i32 s32, 0x80 ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB3_2 @@ -373,7 +378,8 @@ define void @func_non_entry_block_static_alloca_align64(ptr addrspace(1) %out, i ; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 ; FLATSCR-NEXT: global_store_dword v[0:1], v0, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: s_addk_i32 s32, 0xff80 +; FLATSCR-NEXT: s_mov_b32 s32, s34 +; FLATSCR-NEXT: s_mov_b32 s34, s4 ; FLATSCR-NEXT: s_mov_b32 s33, s3 ; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir index a204866170759..ba6524caf668d 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -37,6 +37,8 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, 
implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -50,12 +52,13 @@ body: | ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -86,11 +89,17 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr - ; CHECK: liveins: $sgpr29, $vgpr1 + ; CHECK: liveins: $sgpr29, $vgpr1, $vgpr2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 0, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 
0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -100,11 +109,16 @@ body: | ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $vgpr3 = COPY killed $sgpr33 ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: $sgpr6 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc + ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr6, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -135,21 +149,28 @@ 
body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64 - ; CHECK: liveins: $sgpr28, $vgpr1 + ; CHECK: liveins: $sgpr28, $sgpr29, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 8192, implicit-def $scc - ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr29 - ; CHECK-NEXT: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc - ; CHECK-NEXT: $sgpr29 = S_ADD_I32 killed $sgpr29, 16384, implicit-def $scc - ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr29 + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 8192, implicit-def $scc + ; CHECK-NEXT: $vgpr0 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -8192, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, 16384, implicit-def $scc + ; CHECK-NEXT: $vgpr2 = COPY killed $sgpr33 + ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -16384, implicit-def $scc + ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def 
$sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -179,11 +200,13 @@ body: | liveins: $vgpr1 ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc - ; CHECK: liveins: $sgpr28, $vgpr1 + ; CHECK: liveins: $sgpr28, $sgpr29, $vgpr1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr28 = frame-setup COPY $sgpr33 ; CHECK-NEXT: $sgpr33 = frame-setup S_ADD_I32 $sgpr32, 524224, implicit-def $scc ; CHECK-NEXT: $sgpr33 = frame-setup S_AND_B32 killed $sgpr33, 4294443008, implicit-def dead $scc + ; CHECK-NEXT: $sgpr29 = frame-setup COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 ; CHECK-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -193,7 +216,8 @@ body: | ; CHECK-NEXT: $vcc_lo = S_MOV_B32 16384 ; CHECK-NEXT: $vgpr2, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr31 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 + ; CHECK-NEXT: $sgpr34 = frame-destroy COPY $sgpr29 ; CHECK-NEXT: $sgpr33 = frame-destroy COPY $sgpr28 ; CHECK-NEXT: S_ENDPGM 0 S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr30, implicit-def $sgpr31 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir index a4104737d974f..162d12f651d4a 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -33,6 +33,8 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; 
MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; MUBUF-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec @@ -40,12 +42,13 @@ body: | ; MUBUF-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; MUBUF-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; MUBUF-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0, implicit $vcc ; @@ -60,6 +63,8 @@ body: | ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 8192, 
implicit-def $scc @@ -68,12 +73,13 @@ body: | ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; FLATSCR-NEXT: $sgpr33 = S_ADD_I32 $sgpr33, -16384, implicit-def $scc + ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir index 45e95d133e1bb..a4f936a4d705c 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -32,6 +32,8 @@ body: | ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; CHECK-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; CHECK-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 786432, implicit-def dead $scc ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc ; 
CHECK-NEXT: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc @@ -40,12 +42,13 @@ body: | ; CHECK-NEXT: $sgpr33 = S_ADD_I32 killed $sgpr33, -4096, implicit-def $scc ; CHECK-NEXT: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr27, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; CHECK-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; CHECK-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; CHECK-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 262400, implicit-def dead $scc ; CHECK-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; CHECK-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -786432, implicit-def dead $scc ; CHECK-NEXT: $sgpr33 = COPY $sgpr4 ; CHECK-NEXT: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir index 9462d01ba758d..63a4759d8e740 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -34,6 +34,8 @@ body: | ; GFX8-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX8-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX8-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX8-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX8-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX8-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX8-NEXT: $vcc_lo = S_MOV_B32 8192 @@ -42,12 +44,13 @@ body: | ; GFX8-NEXT: $vcc_lo = S_MOV_B32 16384 ; GFX8-NEXT: $vgpr3, dead $vcc = V_ADD_CO_U32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec ; GFX8-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX8-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX8-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX8-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX8-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def 
dead $scc, implicit $exec ; GFX8-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; GFX8-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX8-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX8-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX8-NEXT: $sgpr33 = COPY $sgpr4 ; GFX8-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -62,18 +65,21 @@ body: | ; GFX9-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX9-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX9-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX9-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 2097152, implicit-def dead $scc ; GFX9-NEXT: $vgpr0 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr0 = V_ADD_U32_e32 8192, killed $vgpr0, implicit $exec ; GFX9-NEXT: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; GFX9-NEXT: $vgpr3 = V_ADD_U32_e32 16384, killed $vgpr3, implicit $exec ; GFX9-NEXT: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX9-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX9-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX9-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX9-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 1048832, implicit-def dead $scc ; GFX9-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; GFX9-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX9-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -2097152, implicit-def dead $scc ; GFX9-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-NEXT: S_ENDPGM 0, amdgpu_allvgprs ; @@ -88,17 +94,20 @@ body: | ; GFX9-FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; GFX9-FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; GFX9-FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; GFX9-FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 8192, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr4 = S_ADD_I32 $sgpr33, 16384, implicit-def $scc ; GFX9-FLATSCR-NEXT: $vgpr0 = V_OR_B32_e32 killed $sgpr4, $vgpr1, implicit $exec + ; GFX9-FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; GFX9-FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GFX9-FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GFX9-FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GFX9-FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 16388, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.3, addrspace 5) ; GFX9-FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; GFX9-FLATSCR-NEXT: $sgpr32 = 
frame-destroy S_ADD_I32 $sgpr32, -32768, implicit-def dead $scc ; GFX9-FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; GFX9-FLATSCR-NEXT: S_ENDPGM 0, amdgpu_allvgprs $vgpr0 = V_MOV_B32_e32 %stack.0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll index 1126db9cae93f..20e5af1b87f1f 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll @@ -340,6 +340,7 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload ; GFX906-NEXT: v_readlane_b32 s31, v41, 1 ; GFX906-NEXT: v_readlane_b32 s30, v41, 0 +; GFX906-NEXT: s_mov_b32 s32, s33 ; GFX906-NEXT: v_readlane_b32 s4, v41, 4 ; GFX906-NEXT: v_readlane_b32 s34, v41, 2 ; GFX906-NEXT: v_readlane_b32 s35, v41, 3 @@ -366,7 +367,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX906-NEXT: s_mov_b64 exec, -1 ; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[6:7] -; GFX906-NEXT: s_addk_i32 s32, 0xd800 ; GFX906-NEXT: s_mov_b32 s33, s4 ; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] @@ -752,8 +752,9 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:168 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload +; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX908-NEXT: s_mov_b32 s32, s33 ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: v_readfirstlane_b32 s4, v0 ; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload @@ -765,7 +766,6 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { ; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX908-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_addk_i32 s32, 0xd400 ; GFX908-NEXT: s_mov_b32 s33, s4 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX908-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll index 648f4fc64f9d0..64a8f5484673f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll @@ -20,8 +20,8 @@ ; CHECK-LABEL: {{^}}call_72xi32: -; GFX11-PAL: NumSgprs: 35 -; GFX11-PAL-GCNTRACKERS: NumSgprs: 35 +; GFX11-PAL: NumSgprs: 37 +; GFX11-PAL-GCNTRACKERS: NumSgprs: 37 ; GFX11-PAL: NumVgprs: 64 ; GFX11-PAL-GCNTRACKERS: NumVgprs: 64 ; GFX11-PAL: ScratchSize: 2780 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 94cbe568a6a44..925984b15367d 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -198,6 +198,7 @@ body: | ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2 ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; GCN-NEXT: $sgpr32 = frame-destroy COPY $sgpr33 ; 
GCN-NEXT: $sgpr0 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) @@ -205,7 +206,6 @@ body: | ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.73, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 - ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc ; GCN-NEXT: $sgpr33 = frame-destroy COPY $vcc_hi ; GCN-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index ff2202f1e177b..4a01962aa4084 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -265,10 +265,10 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -557,10 +557,10 @@ define void @spill_to_lowest_available_vgpr() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1788,7 +1788,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: s_mov_b32 s33, s18 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 5536a09538e6e..0676bc79a46f5 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -226,11 +226,11 @@ entry: ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 ; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 ; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2 
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 70bd63d31d5d7..fed60eecc8a8b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -42,7 +42,7 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 { ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xd800 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 160 define void @needs_align16_stack_align4(i32 %idx) #2 { @@ -63,7 +63,7 @@ define void @needs_align16_stack_align4(i32 %idx) #2 { ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xd000 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 192 define void @needs_align32(i32 %idx) #0 { @@ -79,7 +79,7 @@ define void @needs_align32(i32 %idx) #0 { ; GCN: s_addk_i32 s32, 0xd00{{$}} ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen -; GCN: s_addk_i32 s32, 0xf300 +; GCN: s_mov_b32 s32, s34 ; GCN: ; ScratchSize: 52 define void @force_realign4(i32 %idx) #1 { @@ -127,10 +127,12 @@ define amdgpu_kernel void @kernel_call_align4_from_5() { ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0 ; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000 +; GCN-NEXT: s_mov_b32 s5, s34 +; GCN-NEXT: s_mov_b32 s34, s32 ; GCN-NEXT: s_addk_i32 s32, 0x4000 ; GCN-NOT: s33 ; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} -; GCN: s_addk_i32 s32, 0xc000 +; GCN: s_mov_b32 s32, s34 ; GCN: s_mov_b32 s33, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) @@ -175,12 +177,12 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 { ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 +; GCN-NEXT: s_mov_b32 s32, s34 ; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 ; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 s[30:31] %temp = alloca i32, align 1024, addrspace(5) @@ -209,8 +211,8 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024 ; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen ; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]] -; GCN: s_mov_b32 s34, [[BP_COPY]] -; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 +; GCN: s_mov_b32 s32, s34 +; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]] ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_setpc_b64 s[30:31] begin: diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index da99052ba69ba..8f16fcf6d0890 100644 --- 
a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -1282,16 +1282,16 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE32-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE32-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE32-OPT-NEXT: ;;#ASMSTART ; WAVE32-OPT-NEXT: ; use s19 ; WAVE32-OPT-NEXT: ;;#ASMEND -; WAVE32-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE32-OPT-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE32-OPT-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-OPT-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-OPT-NEXT: s_addk_i32 s32, 0xee00 ; WAVE32-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31] @@ -1317,16 +1317,16 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-OPT-NEXT: s_waitcnt_vscnt null, 0x0 ; WAVE64-OPT-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; WAVE64-OPT-NEXT: s_swappc_b64 s[30:31], s[16:17] +; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE64-OPT-NEXT: ;;#ASMSTART ; WAVE64-OPT-NEXT: ; use s19 ; WAVE64-OPT-NEXT: ;;#ASMEND -; WAVE64-OPT-NEXT: s_mov_b32 s32, s18 ; WAVE64-OPT-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-OPT-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-OPT-NEXT: s_mov_b32 s32, s33 ; WAVE64-OPT-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-OPT-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-OPT-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-OPT-NEXT: s_addk_i32 s32, 0xdc00 ; WAVE64-OPT-NEXT: s_mov_b32 s33, s20 ; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) ; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31] @@ -1433,11 +1433,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-O0-NEXT: s_mov_b32 s32, s4 ; WAVE32-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE32-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE32-O0-NEXT: s_mov_b32 s32, s33 ; WAVE32-O0-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-O0-NEXT: s_mov_b32 s33, s24 ; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE32-O0-NEXT: s_setpc_b64 s[30:31] @@ -1544,11 +1544,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE64-O0-NEXT: s_mov_b32 s32, s4 ; WAVE64-O0-NEXT: v_readlane_b32 s31, v32, 1 ; WAVE64-O0-NEXT: v_readlane_b32 s30, v32, 0 +; WAVE64-O0-NEXT: s_mov_b32 s32, s33 ; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; WAVE64-O0-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5] -; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00 ; WAVE64-O0-NEXT: s_mov_b32 s33, s19 ; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) ; WAVE64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1655,11 +1655,11 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 +; 
WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 -; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) ; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 3bf7fec81c041..ebd4bc881f2af 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -196,11 +196,11 @@ define void @outgoing_f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -232,11 +232,11 @@ define void @outgoing_v2f16_arg(ptr %ptr) #0 { ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -268,9 +268,10 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 -; GFX7-NEXT: v_readlane_b32 s4, v42, 2 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: flat_store_short v[40:41], v0 ; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload @@ -278,7 +279,6 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -315,8 +315,9 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s4, v42, 2 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 @@ -326,7 +327,6 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; 
GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -383,11 +383,11 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -470,11 +470,11 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: v_readlane_b32 s31, v42, 1 ; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -528,11 +528,11 @@ define half @call_split_type_used_outside_block_v8f16() #0 { ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_readlane_b32 s31, v40, 1 ; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: s_mov_b32 s32, s33 ; GFX7-NEXT: v_readlane_b32 s4, v40, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] -; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll index cd6cb4d1e9fe4..242b5e9aeaf42 100644 --- a/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll +++ b/llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll @@ -28,11 +28,11 @@ define void @tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -64,11 +64,11 @@ define void @indirect_tail_call_i32_inreg_divergent(i32 %vgpr) { ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index 
b678e3e87202a..d9df80ce6c1c0 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -110,11 +110,11 @@ define hidden void @widget() { ; GCN-NEXT: v_readlane_b32 s31, v41, 1 ; GCN-NEXT: v_readlane_b32 s30, v41, 0 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v41, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -458,11 +458,11 @@ define hidden void @blam() { ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v45, 26 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir index 33fb595157256..8a0bf26f81d22 100644 --- a/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir +++ b/llvm/test/CodeGen/AMDGPU/use_restore_frame_reg.mir @@ -47,6 +47,8 @@ body: | ; MUBUF-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (store (s32) into %stack.20, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; MUBUF-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; MUBUF-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; MUBUF-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 11010048, implicit-def dead $scc ; MUBUF-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; MUBUF-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -67,12 +69,13 @@ body: | ; MUBUF-NEXT: bb.2: ; MUBUF-NEXT: liveins: $vgpr2 ; MUBUF-NEXT: {{ $}} + ; MUBUF-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; MUBUF-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; MUBUF-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; MUBUF-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; MUBUF-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 9961728, implicit-def dead $scc ; MUBUF-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, implicit $exec :: (load (s32) from %stack.20, addrspace 5) ; MUBUF-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 
- ; MUBUF-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -11010048, implicit-def dead $scc ; MUBUF-NEXT: $sgpr33 = COPY $sgpr4 ; MUBUF-NEXT: S_ENDPGM 0 ; @@ -89,6 +92,8 @@ body: | ; FLATSCR-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.20, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, undef $vgpr2 + ; FLATSCR-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr34, 1, undef $vgpr2 + ; FLATSCR-NEXT: $sgpr34 = frame-setup COPY $sgpr32 ; FLATSCR-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 172032, implicit-def dead $scc ; FLATSCR-NEXT: S_CMP_EQ_U32 0, 0, implicit-def $scc ; FLATSCR-NEXT: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr27, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc @@ -118,12 +123,13 @@ body: | ; FLATSCR-NEXT: bb.2: ; FLATSCR-NEXT: liveins: $vgpr2 ; FLATSCR-NEXT: {{ $}} + ; FLATSCR-NEXT: $sgpr32 = frame-destroy COPY $sgpr34 ; FLATSCR-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 + ; FLATSCR-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 ; FLATSCR-NEXT: $sgpr6_sgpr7 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; FLATSCR-NEXT: $sgpr5 = S_ADD_I32 $sgpr33, 155652, implicit-def dead $scc ; FLATSCR-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR killed $sgpr5, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.20, addrspace 5) ; FLATSCR-NEXT: $exec = S_MOV_B64 killed $sgpr6_sgpr7 - ; FLATSCR-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -172032, implicit-def dead $scc ; FLATSCR-NEXT: $sgpr33 = COPY $sgpr4 ; FLATSCR-NEXT: S_ENDPGM 0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 2b96e10fd3cc3..6b9476af7a493 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -54,11 +54,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v44, 1 ; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -111,12 +111,12 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v44, 1 ; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 
4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -165,11 +165,11 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v44, 1 ; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -238,11 +238,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v45, 1 ; GFX9-NEXT: v_readlane_b32 s30, v45, 0 +; GFX9-NEXT: s_mov_b32 s32, s33 ; GFX9-NEXT: v_readlane_b32 s4, v45, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -288,12 +288,12 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; GFX10-NEXT: v_readlane_b32 s31, v45, 1 ; GFX10-NEXT: v_readlane_b32 s30, v45, 0 +; GFX10-NEXT: s_mov_b32 s32, s33 ; GFX10-NEXT: v_readlane_b32 s4, v45, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -337,11 +337,11 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; GFX11-NEXT: v_readlane_b32 s31, v45, 1 ; GFX11-NEXT: v_readlane_b32 s30, v45, 0 +; GFX11-NEXT: s_mov_b32 s32, s33 ; GFX11-NEXT: v_readlane_b32 s0, v45, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 4e17be1ebb312..0307472fce732 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2866,12 +2866,12 @@ define void @callee_no_stack_with_call() #1 { ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1032-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1032-NEXT: s_mov_b32 s32, s33 ; GFX1032-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1032-NEXT: s_or_saveexec_b32 s5, -1 ; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_mov_b32 exec_lo, s5 -; GFX1032-NEXT: s_addk_i32 s32, 0xfe00 ; GFX1032-NEXT: s_mov_b32 s33, s4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; 
GFX1032-NEXT: s_setpc_b64 s[30:31] @@ -2897,12 +2897,12 @@ define void @callee_no_stack_with_call() #1 { ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1064-NEXT: v_readlane_b32 s30, v40, 0 +; GFX1064-NEXT: s_mov_b32 s32, s33 ; GFX1064-NEXT: v_readlane_b32 s4, v40, 2 ; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, s[6:7] -; GFX1064-NEXT: s_addk_i32 s32, 0xfc00 ; GFX1064-NEXT: s_mov_b32 s33, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll index d1ee82e74b3de..cb3a0e1ebb553 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll @@ -47,6 +47,7 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 ; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 +; GFX90A-NEXT: s_mov_b32 s32, s33 ; GFX90A-NEXT: v_readlane_b32 s4, v40, 4 ; GFX90A-NEXT: v_readlane_b32 s28, v40, 2 ; GFX90A-NEXT: v_readlane_b32 s29, v40, 3 @@ -56,7 +57,6 @@ define void @vector_reg_liverange_split() #0 { ; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_addk_i32 s32, 0xfc00 ; GFX90A-NEXT: s_mov_b32 s33, s4 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll index 4837efe6606b8..766386d84a616 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll @@ -53,6 +53,7 @@ define void @test() #0 { ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-NEXT: s_mov_b32 s32, s33 ; GCN-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-NEXT: v_readlane_b32 s29, v40, 3 @@ -61,7 +62,6 @@ define void @test() #0 { ; GCN-NEXT: s_mov_b64 exec, -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -113,6 +113,7 @@ define void @test() #0 { ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 +; GCN-O0-NEXT: s_mov_b32 s32, s33 ; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 ; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 ; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 @@ -121,7 +122,6 @@ define void @test() #0 { ; GCN-O0-NEXT: s_mov_b64 exec, -1 ; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GCN-O0-NEXT: s_mov_b32 s33, s4 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index 1089093ea691c..2e59a36adb7e4 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -390,12 +390,12 @@ 
define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; GFX9-O0-NEXT: s_mov_b32 s33, s48 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -428,12 +428,12 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 +; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] @@ -636,6 +636,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 +; GFX9-O0-NEXT: s_mov_b32 s32, s33 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -650,7 +651,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 ; GFX9-O0-NEXT: s_mov_b32 s33, s46 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -698,6 +698,7 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 ; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 ; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 +; GFX9-O3-NEXT: s_mov_b32 s32, s33 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload @@ -707,7 +708,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-O3-NEXT: s_mov_b32 s33, s38 ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: s_setpc_b64 s[30:31] diff --git 
a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected index d1500e002d7e9..429bee4195fa9 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.generated.expected @@ -101,7 +101,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -138,7 +138,7 @@ attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected index deadc4adb02c5..842fd8836da7e 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/amdgpu_generated_funcs.ll.nogenerated.expected @@ -42,7 +42,7 @@ define dso_local i32 @check_boundaries() #0 { ; CHECK-NEXT: .LBB0_4: ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s8 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -115,7 +115,7 @@ define dso_local i32 @main() #0 { ; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:16 -; CHECK-NEXT: s_addk_i32 s32, 0xfa00 +; CHECK-NEXT: s_mov_b32 s32, s33 ; CHECK-NEXT: s_mov_b32 s33, s6 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] From 970094d50b08e694c2302f7ee39b1c33d08f2405 Mon Sep 17 00:00:00 2001 From: Lou Date: Fri, 24 Jan 2025 15:08:14 +0100 Subject: [PATCH 005/432] [llvm-opt-report] Show scalable vectorization factors (#123367) Scalable vectorization factors are printed as "vscale x VF" where VF is the known minimum number of elements, an integer. Currently, llvm-opt-report always expects an integer (like for vectorization with fixed-sized vectors), and does not display any vectorization factor in the output (just 'V', but without a number). This patch adds support for scalable vectorization factors and prints them as "VNx", so for example "VNx4". The "Nx" is used to differentiate between fixed-sized and scalable factors, and is consistent with the way LLVM mangles scalable vectors in other places.
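To make the parse-and-print scheme concrete, here is a minimal standalone sketch in plain C++ with illustrative names only; the actual patch works in terms of llvm::StringRef and llvm::ElementCount, as the diff below shows. A factor string is either a plain integer or "vscale x " followed by the known minimum element count, and a scalable factor renders with the extra "Nx" marker.

    // Sketch only: VecFactor stands in for llvm::ElementCount.
    #include <cstdio>
    #include <string>
    #include <string_view>

    struct VecFactor {
      unsigned MinVal = 1;   // known minimum number of elements
      bool Scalable = false; // true for "vscale x N" factors
    };

    static VecFactor parseFactor(std::string_view Val) {
      VecFactor VF;
      constexpr std::string_view Prefix = "vscale x ";
      if (Val.substr(0, Prefix.size()) == Prefix) { // scalable spelling
        VF.Scalable = true;
        Val.remove_prefix(Prefix.size());
      }
      VF.MinVal = static_cast<unsigned>(std::stoul(std::string(Val)));
      return VF;
    }

    static std::string formatFactor(const VecFactor &VF) {
      // "V4" for a fixed factor, "VNx4" for a scalable one.
      return std::string("V") + (VF.Scalable ? "Nx" : "") +
             std::to_string(VF.MinVal);
    }

    int main() {
      std::printf("%s\n", formatFactor(parseFactor("4")).c_str());          // V4
      std::printf("%s\n", formatFactor(parseFactor("vscale x 4")).c_str()); // VNx4
    }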
--- .../tools/llvm-opt-report/Inputs/scalable.c | 9 ++++ .../llvm-opt-report/Inputs/scalable.yaml | 12 +++++ llvm/test/tools/llvm-opt-report/scalabe.test | 12 +++++ llvm/tools/llvm-opt-report/OptReport.cpp | 44 ++++++++++++++----- 4 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 llvm/test/tools/llvm-opt-report/Inputs/scalable.c create mode 100644 llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml create mode 100644 llvm/test/tools/llvm-opt-report/scalabe.test diff --git a/llvm/test/tools/llvm-opt-report/Inputs/scalable.c b/llvm/test/tools/llvm-opt-report/Inputs/scalable.c new file mode 100644 index 0000000000000..d2fa6fb879c1f --- /dev/null +++ b/llvm/test/tools/llvm-opt-report/Inputs/scalable.c @@ -0,0 +1,9 @@ +#include <stddef.h> + +void foo(size_t N, float A[restrict N], float B[N]) { + #pragma clang loop vectorize_width(4, scalable) + for (size_t i = 0; i < N; i++) { + A[i] = B[i] * 42.f; + } +} + diff --git a/llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml b/llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml new file mode 100644 index 0000000000000..7f248c57faa6c --- /dev/null +++ b/llvm/test/tools/llvm-opt-report/Inputs/scalable.yaml @@ -0,0 +1,12 @@ +--- !Passed +Pass: loop-vectorize +Name: Vectorized +DebugLoc: { File: './Inputs/scalable.c', Line: 5, Column: 3 } +Function: foo +Args: + - String: 'vectorized loop (vectorization width: ' + - VectorizationFactor: vscale x 4 + - String: ', interleaved count: ' + - InterleaveCount: '2' + - String: ')' +... diff --git a/llvm/test/tools/llvm-opt-report/scalabe.test b/llvm/test/tools/llvm-opt-report/scalabe.test new file mode 100644 index 0000000000000..c853c57c46b2b --- /dev/null +++ b/llvm/test/tools/llvm-opt-report/scalabe.test @@ -0,0 +1,12 @@ +RUN: llvm-opt-report -r %p %p/Inputs/scalable.yaml | FileCheck -strict-whitespace %s + +; CHECK: < {{.*[/\]}}scalable.c +; CHECK-NEXT: 1 | #include <stddef.h> +; CHECK-NEXT: 2 | +; CHECK-NEXT: 3 | void foo(size_t N, float A[restrict N], float B[N]) { +; CHECK-NEXT: 4 | #pragma clang loop vectorize_width(4, scalable) +; CHECK-NEXT: 5 VNx4,2 | for (size_t i = 0; i < N; i++) { +; CHECK-NEXT: 6 | A[i] = B[i] * 42.f; +; CHECK-NEXT: 7 | } +; CHECK-NEXT: 8 | } +; CHECK-NEXT: 9 | diff --git a/llvm/tools/llvm-opt-report/OptReport.cpp b/llvm/tools/llvm-opt-report/OptReport.cpp index cee9abcb49419..68ed92c8bacea 100644 --- a/llvm/tools/llvm-opt-report/OptReport.cpp +++ b/llvm/tools/llvm-opt-report/OptReport.cpp @@ -29,6 +29,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" +#include "llvm/Support/TypeSize.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include @@ -100,7 +101,7 @@ struct OptReportLocationInfo { OptReportLocationItemInfo Unrolled; OptReportLocationItemInfo Vectorized; - int VectorizationFactor = 1; + ElementCount VectorizationFactor = ElementCount::getFixed(1); int InterleaveCount = 1; int UnrollCount = 1; @@ -109,8 +110,9 @@ struct OptReportLocationInfo { Unrolled |= RHS.Unrolled; Vectorized |= RHS.Vectorized; - VectorizationFactor = - std::max(VectorizationFactor, RHS.VectorizationFactor); + if (ElementCount::isKnownLT(VectorizationFactor, RHS.VectorizationFactor)) + VectorizationFactor = RHS.VectorizationFactor; + InterleaveCount = std::max(InterleaveCount, RHS.InterleaveCount); UnrollCount = std::max(UnrollCount, RHS.UnrollCount); @@ -130,9 +132,11 @@ struct OptReportLocationInfo { return true; else if (RHS.Vectorized < Vectorized || Succinct) return false; - else if (VectorizationFactor < 
RHS.VectorizationFactor) + else if (ElementCount::isKnownLT(VectorizationFactor, + RHS.VectorizationFactor)) return true; - else if (VectorizationFactor > RHS.VectorizationFactor) + else if (ElementCount::isKnownGT(VectorizationFactor, + RHS.VectorizationFactor)) return false; else if (InterleaveCount < RHS.InterleaveCount) return true; @@ -197,17 +201,26 @@ static bool readLocationInfo(LocationInfoTy &LocationInfo) { bool Transformed = Remark.RemarkType == remarks::Type::Passed; - int VectorizationFactor = 1; + ElementCount VectorizationFactor = ElementCount::getFixed(1); int InterleaveCount = 1; int UnrollCount = 1; for (const remarks::Argument &Arg : Remark.Args) { - if (Arg.Key == "VectorizationFactor") - Arg.Val.getAsInteger(10, VectorizationFactor); - else if (Arg.Key == "InterleaveCount") + if (Arg.Key == "VectorizationFactor") { + int MinValue = 1; + bool IsScalable = false; + if (Arg.Val.starts_with("vscale x ")) { + Arg.Val.drop_front(9).getAsInteger(10, MinValue); + IsScalable = true; + } else { + Arg.Val.getAsInteger(10, MinValue); + } + VectorizationFactor = ElementCount::get(MinValue, IsScalable); + } else if (Arg.Key == "InterleaveCount") { Arg.Val.getAsInteger(10, InterleaveCount); - else if (Arg.Key == "UnrollCount") + } else if (Arg.Key == "UnrollCount") { Arg.Val.getAsInteger(10, UnrollCount); + } } const std::optional &Loc = Remark.Loc; @@ -292,7 +305,11 @@ static bool writeReport(LocationInfoTy &LocationInfo) { bool NothingUnrolled = !MaxLI.Unrolled.Transformed; bool NothingVectorized = !MaxLI.Vectorized.Transformed; - unsigned VFDigits = llvm::utostr(MaxLI.VectorizationFactor).size(); + unsigned VFDigits = + llvm::utostr(MaxLI.VectorizationFactor.getKnownMinValue()).size(); + if (MaxLI.VectorizationFactor.isScalable()) + VFDigits += 2; // For "Nx..." + unsigned ICDigits = llvm::utostr(MaxLI.InterleaveCount).size(); unsigned UCDigits = llvm::utostr(MaxLI.UnrollCount).size(); @@ -382,7 +399,10 @@ static bool writeReport(LocationInfoTy &LocationInfo) { raw_string_ostream RS(R); if (!Succinct) { - RS << LLI.VectorizationFactor << "," << LLI.InterleaveCount; + if (LLI.VectorizationFactor.isScalable()) + RS << "Nx"; + RS << LLI.VectorizationFactor.getKnownMinValue() << "," + << LLI.InterleaveCount; RS << std::string(VFDigits + ICDigits + 1 - R.size(), ' '); } From e5e55c04d6af4ae32c99d574f59e632595abf607 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Fri, 24 Jan 2025 09:08:34 -0500 Subject: [PATCH 006/432] [GlobalMerge][NFC] Skip sorting by profitability when it is not needed (#124146) We were previously sorting by profitability even if we were choosing to merge all globals together, which is not impacted by UsedGlobalSet order. We can also remove iteration of UsedGlobalSets in reverse order in both cases. In the first case, the order does not matter. In the second case, we just sort by the order we need instead of sorting in the opposite direction and calling reverse. This change should only be an improvement on compile time. I have not measured it, but I think it would never make things worse. --- llvm/lib/CodeGen/GlobalMerge.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp index 7b76155b175d1..41e01a1d3ccd5 100644 --- a/llvm/lib/CodeGen/GlobalMerge.cpp +++ b/llvm/lib/CodeGen/GlobalMerge.cpp @@ -423,24 +423,12 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, } } - // Now we found a bunch of sets of globals used together.
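A tiny self-contained sketch of the sort direction the commit message describes: sort descending by the crude size-times-usage metric directly, instead of sorting ascending and iterating in reverse. Names here are illustrative only, and note that a stable sort requires a strict weak ordering, so a comparator of this shape should use '>' rather than '>='.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct UsedSet {
      unsigned NumGlobals; // stand-in for UGS.Globals.count()
      unsigned UsageCount; // stand-in for UGS.UsageCount
    };

    int main() {
      std::vector<UsedSet> Sets = {{2, 3}, {4, 1}, {3, 5}};
      // Most profitable first; strict '>' keeps the ordering valid.
      std::stable_sort(Sets.begin(), Sets.end(),
                       [](const UsedSet &A, const UsedSet &B) {
                         return A.NumGlobals * A.UsageCount >
                                B.NumGlobals * B.UsageCount;
                       });
      for (const UsedSet &S : Sets) // prints 15, 6, 4
        std::printf("%u\n", S.NumGlobals * S.UsageCount);
    }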
We accumulated - // the number of times we encountered the sets (i.e., the number of functions - // that use that exact set of globals). - // - // Multiply that by the size of the set to give us a crude profitability - // metric. - llvm::stable_sort(UsedGlobalSets, - [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) { - return UGS1.Globals.count() * UGS1.UsageCount < - UGS2.Globals.count() * UGS2.UsageCount; - }); - // We can choose to merge all globals together, but ignore globals never used // with another global. This catches the obviously non-profitable cases of // having a single global, but is aggressive enough for any other case. if (GlobalMergeIgnoreSingleUse) { BitVector AllGlobals(Globals.size()); - for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { + for (const UsedGlobalSet &UGS : UsedGlobalSets) { if (UGS.UsageCount == 0) continue; if (UGS.Globals.count() > 1) @@ -449,6 +437,16 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, return doMerge(Globals, AllGlobals, M, isConst, AddrSpace); } + // Now we found a bunch of sets of globals used together. We accumulated + // the number of times we encountered the sets (i.e., the number of functions + // that use that exact set of globals). Multiply that by the size of the set + // to give us a crude profitability metric. + llvm::stable_sort(UsedGlobalSets, + [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) { + return UGS1.Globals.count() * UGS1.UsageCount >= + UGS2.Globals.count() * UGS2.UsageCount; + }); + // Starting from the sets with the best (=biggest) profitability, find a // good combination. // The ideal (and expensive) solution can only be found by trying all @@ -458,7 +456,7 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl &Globals, BitVector PickedGlobals(Globals.size()); bool Changed = false; - for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) { + for (const UsedGlobalSet &UGS : UsedGlobalSets) { if (UGS.UsageCount == 0) continue; if (PickedGlobals.anyCommon(UGS.Globals)) From 77c780d64b950d6850d5ec1ee06cd0c21b38b89e Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 24 Jan 2025 15:11:13 +0100 Subject: [PATCH 007/432] [bazel] Port eb206e9ea84eff0a0596fed2de8316d924f946d1 Leave around an alias so users can move at their own pace. --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 72c28faed1d16..3336daf6773b2 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7,6 +7,7 @@ load("@bazel_skylib//rules:common_settings.bzl", "bool_flag") load("@bazel_skylib//rules:expand_template.bzl", "expand_template") +load("//llvm:binary_alias.bzl", "binary_alias") load( ":build_defs.bzl", "cc_headers_only", @@ -10174,8 +10175,8 @@ cc_binary( ) cc_binary( - name = "mlir-cpu-runner", - srcs = ["tools/mlir-cpu-runner/mlir-cpu-runner.cpp"], + name = "mlir-runner", + srcs = ["tools/mlir-runner/mlir-runner.cpp"], deps = [ ":AllToLLVMIRTranslations", ":BuiltinToLLVMIRTranslation", @@ -10195,6 +10196,12 @@ cc_binary( ], ) +# TODO: Remove this alias. +binary_alias( + name = "mlir-cpu-runner", + binary = ":mlir-runner", +) + # This target provides the headers from LLVM's Support target without any of # the symbols. 
In particular, it does not contain the static registration code # which may be executed by at most one shared library loaded by ORCJit. Direct From acde3f722ff3766f6f793884108d342b78623fe4 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Fri, 24 Jan 2025 09:26:28 -0500 Subject: [PATCH 008/432] [mlir:python] Compute get_op_result_or_value in PyOpView's constructor. (#123953) This logic is in the critical path for constructing an operation from Python. It is faster to compute this in C++ than it is in Python, and it is a minor change to do this. This change also alters the API contract of _ods_common.get_op_results_or_values to avoid calling get_op_result_or_value on each element of a sequence, since the C++ code will now do this. Most of the diff here is simply reordering the code in IRCore.cpp. --- mlir/lib/Bindings/Python/IRCore.cpp | 432 ++++++++++-------- mlir/lib/Bindings/Python/IRModule.h | 2 +- mlir/python/mlir/dialects/_ods_common.py | 7 +- mlir/test/mlir-tblgen/op-python-bindings.td | 26 +- mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp | 9 +- 5 files changed, 255 insertions(+), 221 deletions(-) diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 738f1444b15fe..8e351cb22eb94 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -1481,12 +1481,11 @@ static void maybeInsertOperation(PyOperationRef &op, nb::object PyOperation::create(std::string_view name, std::optional> results, - std::optional> operands, + llvm::ArrayRef operands, std::optional attributes, std::optional> successors, int regions, DefaultingPyLocation location, const nb::object &maybeIp, bool inferType) { - llvm::SmallVector mlirOperands; llvm::SmallVector mlirResults; llvm::SmallVector mlirSuccessors; llvm::SmallVector, 4> mlirAttributes; @@ -1495,16 +1494,6 @@ nb::object PyOperation::create(std::string_view name, if (regions < 0) throw nb::value_error("number of regions must be >= 0"); - // Unpack/validate operands. - if (operands) { - mlirOperands.reserve(operands->size()); - for (PyValue *operand : *operands) { - if (!operand) - throw nb::value_error("operand value cannot be None"); - mlirOperands.push_back(operand->get()); - } - } - // Unpack/validate results. if (results) { mlirResults.reserve(results->size()); @@ -1562,9 +1551,8 @@ nb::object PyOperation::create(std::string_view name, // point, exceptions cannot be thrown or else the state will leak. MlirOperationState state = mlirOperationStateGet(toMlirStringRef(name), location); - if (!mlirOperands.empty()) - mlirOperationStateAddOperands(&state, mlirOperands.size(), - mlirOperands.data()); + if (!operands.empty()) + mlirOperationStateAddOperands(&state, operands.size(), operands.data()); state.enableResultTypeInference = inferType; if (!mlirResults.empty()) mlirOperationStateAddResults(&state, mlirResults.size(), @@ -1632,6 +1620,143 @@ void PyOperation::erase() { mlirOperationDestroy(operation); } +namespace { +/// CRTP base class for Python MLIR values that subclass Value and should be +/// castable from it. The value hierarchy is one level deep and is not supposed +/// to accommodate other levels unless core MLIR changes. +template +class PyConcreteValue : public PyValue { +public: + // Derived classes must define statics for: + // IsAFunctionTy isaFunction + // const char *pyClassName + // and redefine bindDerived. 
+ using ClassTy = nb::class_; + using IsAFunctionTy = bool (*)(MlirValue); + + PyConcreteValue() = default; + PyConcreteValue(PyOperationRef operationRef, MlirValue value) + : PyValue(operationRef, value) {} + PyConcreteValue(PyValue &orig) + : PyConcreteValue(orig.getParentOperation(), castFrom(orig)) {} + + /// Attempts to cast the original value to the derived type and throws on + /// type mismatches. + static MlirValue castFrom(PyValue &orig) { + if (!DerivedTy::isaFunction(orig.get())) { + auto origRepr = nb::cast(nb::repr(nb::cast(orig))); + throw nb::value_error((Twine("Cannot cast value to ") + + DerivedTy::pyClassName + " (from " + origRepr + + ")") + .str() + .c_str()); + } + return orig.get(); + } + + /// Binds the Python module objects to functions of this class. + static void bind(nb::module_ &m) { + auto cls = ClassTy(m, DerivedTy::pyClassName); + cls.def(nb::init(), nb::keep_alive<0, 1>(), nb::arg("value")); + cls.def_static( + "isinstance", + [](PyValue &otherValue) -> bool { + return DerivedTy::isaFunction(otherValue); + }, + nb::arg("other_value")); + cls.def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, + [](DerivedTy &self) { return self.maybeDownCast(); }); + DerivedTy::bindDerived(cls); + } + + /// Implemented by derived classes to add methods to the Python subclass. + static void bindDerived(ClassTy &m) {} +}; + +} // namespace + +/// Python wrapper for MlirOpResult. +class PyOpResult : public PyConcreteValue { +public: + static constexpr IsAFunctionTy isaFunction = mlirValueIsAOpResult; + static constexpr const char *pyClassName = "OpResult"; + using PyConcreteValue::PyConcreteValue; + + static void bindDerived(ClassTy &c) { + c.def_prop_ro("owner", [](PyOpResult &self) { + assert( + mlirOperationEqual(self.getParentOperation()->get(), + mlirOpResultGetOwner(self.get())) && + "expected the owner of the value in Python to match that in the IR"); + return self.getParentOperation().getObject(); + }); + c.def_prop_ro("result_number", [](PyOpResult &self) { + return mlirOpResultGetResultNumber(self.get()); + }); + } +}; + +/// Returns the list of types of the values held by container. +template +static std::vector getValueTypes(Container &container, + PyMlirContextRef &context) { + std::vector result; + result.reserve(container.size()); + for (int i = 0, e = container.size(); i < e; ++i) { + result.push_back(mlirValueGetType(container.getElement(i).get())); + } + return result; +} + +/// A list of operation results. Internally, these are stored as consecutive +/// elements, random access is cheap. The (returned) result list is associated +/// with the operation whose results these are, and thus extends the lifetime of +/// this operation. +class PyOpResultList : public Sliceable { +public: + static constexpr const char *pyClassName = "OpResultList"; + using SliceableT = Sliceable; + + PyOpResultList(PyOperationRef operation, intptr_t startIndex = 0, + intptr_t length = -1, intptr_t step = 1) + : Sliceable(startIndex, + length == -1 ? mlirOperationGetNumResults(operation->get()) + : length, + step), + operation(std::move(operation)) {} + + static void bindDerived(ClassTy &c) { + c.def_prop_ro("types", [](PyOpResultList &self) { + return getValueTypes(self, self.operation->getContext()); + }); + c.def_prop_ro("owner", [](PyOpResultList &self) { + return self.operation->createOpView(); + }); + } + + PyOperationRef &getOperation() { return operation; } + +private: + /// Give the parent CRTP class access to hook implementations below. 
+ friend class Sliceable; + + intptr_t getRawNumElements() { + operation->checkValid(); + return mlirOperationGetNumResults(operation->get()); + } + + PyOpResult getRawElement(intptr_t index) { + PyValue value(operation, mlirOperationGetResult(operation->get(), index)); + return PyOpResult(value); + } + + PyOpResultList slice(intptr_t startIndex, intptr_t length, intptr_t step) { + return PyOpResultList(operation, startIndex, length, step); + } + + PyOperationRef operation; +}; + //------------------------------------------------------------------------------ // PyOpView //------------------------------------------------------------------------------ @@ -1733,6 +1858,40 @@ static void populateResultTypes(StringRef name, nb::list resultTypeList, } } +static MlirValue getUniqueResult(MlirOperation operation) { + auto numResults = mlirOperationGetNumResults(operation); + if (numResults != 1) { + auto name = mlirIdentifierStr(mlirOperationGetName(operation)); + throw nb::value_error((Twine("Cannot call .result on operation ") + + StringRef(name.data, name.length) + " which has " + + Twine(numResults) + + " results (it is only valid for operations with a " + "single result)") + .str() + .c_str()); + } + return mlirOperationGetResult(operation, 0); +} + +static MlirValue getOpResultOrValue(nb::handle operand) { + if (operand.is_none()) { + throw nb::value_error("contained a None item"); + } + PyOperationBase *op; + if (nb::try_cast(operand, op)) { + return getUniqueResult(op->getOperation()); + } + PyOpResultList *opResultList; + if (nb::try_cast(operand, opResultList)) { + return getUniqueResult(opResultList->getOperation()->get()); + } + PyValue *value; + if (nb::try_cast(operand, value)) { + return value->get(); + } + throw nb::value_error("is not a Value"); +} + nb::object PyOpView::buildGeneric( std::string_view name, std::tuple opRegionSpec, nb::object operandSegmentSpecObj, nb::object resultSegmentSpecObj, @@ -1783,16 +1942,14 @@ nb::object PyOpView::buildGeneric( } // Unpack operands. - std::vector operands; + llvm::SmallVector operands; operands.reserve(operands.size()); if (operandSegmentSpecObj.is_none()) { // Non-sized operand unpacking. for (const auto &it : llvm::enumerate(operandList)) { try { - operands.push_back(nb::cast(it.value())); - if (!operands.back()) - throw nb::cast_error(); - } catch (nb::cast_error &err) { + operands.push_back(getOpResultOrValue(it.value())); + } catch (nb::builtin_exception &err) { throw nb::value_error((llvm::Twine("Operand ") + llvm::Twine(it.index()) + " of operation \"" + name + "\" must be a Value (" + err.what() + ")") @@ -1818,29 +1975,31 @@ nb::object PyOpView::buildGeneric( int segmentSpec = std::get<1>(it.value()); if (segmentSpec == 1 || segmentSpec == 0) { // Unpack unary element. - try { - auto *operandValue = nb::cast(std::get<0>(it.value())); - if (operandValue) { - operands.push_back(operandValue); - operandSegmentLengths.push_back(1); - } else if (segmentSpec == 0) { - // Allowed to be optional. 
- operandSegmentLengths.push_back(0); - } else { - throw nb::value_error( - (llvm::Twine("Operand ") + llvm::Twine(it.index()) + - " of operation \"" + name + - "\" must be a Value (was None and operand is not optional)") - .str() - .c_str()); + auto &operand = std::get<0>(it.value()); + if (!operand.is_none()) { + try { + + operands.push_back(getOpResultOrValue(operand)); + } catch (nb::builtin_exception &err) { + throw nb::value_error((llvm::Twine("Operand ") + + llvm::Twine(it.index()) + + " of operation \"" + name + + "\" must be a Value (" + err.what() + ")") + .str() + .c_str()); } - } catch (nb::cast_error &err) { - throw nb::value_error((llvm::Twine("Operand ") + - llvm::Twine(it.index()) + " of operation \"" + - name + "\" must be a Value (" + err.what() + - ")") - .str() - .c_str()); + + operandSegmentLengths.push_back(1); + } else if (segmentSpec == 0) { + // Allowed to be optional. + operandSegmentLengths.push_back(0); + } else { + throw nb::value_error( + (llvm::Twine("Operand ") + llvm::Twine(it.index()) + + " of operation \"" + name + + "\" must be a Value (was None and operand is not optional)") + .str() + .c_str()); } } else if (segmentSpec == -1) { // Unpack sequence by appending. @@ -1852,10 +2011,7 @@ nb::object PyOpView::buildGeneric( // Unpack the list. auto segment = nb::cast(std::get<0>(it.value())); for (nb::handle segmentItem : segment) { - operands.push_back(nb::cast(segmentItem)); - if (!operands.back()) { - throw nb::type_error("contained a None item"); - } + operands.push_back(getOpResultOrValue(segmentItem)); } operandSegmentLengths.push_back(nb::len(segment)); } @@ -2269,57 +2425,6 @@ void PySymbolTable::walkSymbolTables(PyOperationBase &from, } namespace { -/// CRTP base class for Python MLIR values that subclass Value and should be -/// castable from it. The value hierarchy is one level deep and is not supposed -/// to accommodate other levels unless core MLIR changes. -template -class PyConcreteValue : public PyValue { -public: - // Derived classes must define statics for: - // IsAFunctionTy isaFunction - // const char *pyClassName - // and redefine bindDerived. - using ClassTy = nb::class_; - using IsAFunctionTy = bool (*)(MlirValue); - - PyConcreteValue() = default; - PyConcreteValue(PyOperationRef operationRef, MlirValue value) - : PyValue(operationRef, value) {} - PyConcreteValue(PyValue &orig) - : PyConcreteValue(orig.getParentOperation(), castFrom(orig)) {} - - /// Attempts to cast the original value to the derived type and throws on - /// type mismatches. - static MlirValue castFrom(PyValue &orig) { - if (!DerivedTy::isaFunction(orig.get())) { - auto origRepr = nb::cast(nb::repr(nb::cast(orig))); - throw nb::value_error((Twine("Cannot cast value to ") + - DerivedTy::pyClassName + " (from " + origRepr + - ")") - .str() - .c_str()); - } - return orig.get(); - } - - /// Binds the Python module objects to functions of this class. - static void bind(nb::module_ &m) { - auto cls = ClassTy(m, DerivedTy::pyClassName); - cls.def(nb::init(), nb::keep_alive<0, 1>(), nb::arg("value")); - cls.def_static( - "isinstance", - [](PyValue &otherValue) -> bool { - return DerivedTy::isaFunction(otherValue); - }, - nb::arg("other_value")); - cls.def(MLIR_PYTHON_MAYBE_DOWNCAST_ATTR, - [](DerivedTy &self) { return self.maybeDownCast(); }); - DerivedTy::bindDerived(cls); - } - - /// Implemented by derived classes to add methods to the Python subclass. - static void bindDerived(ClassTy &m) {} -}; /// Python wrapper for MlirBlockArgument. 
class PyBlockArgument : public PyConcreteValue { @@ -2345,39 +2450,6 @@ class PyBlockArgument : public PyConcreteValue { } }; -/// Python wrapper for MlirOpResult. -class PyOpResult : public PyConcreteValue { -public: - static constexpr IsAFunctionTy isaFunction = mlirValueIsAOpResult; - static constexpr const char *pyClassName = "OpResult"; - using PyConcreteValue::PyConcreteValue; - - static void bindDerived(ClassTy &c) { - c.def_prop_ro("owner", [](PyOpResult &self) { - assert( - mlirOperationEqual(self.getParentOperation()->get(), - mlirOpResultGetOwner(self.get())) && - "expected the owner of the value in Python to match that in the IR"); - return self.getParentOperation().getObject(); - }); - c.def_prop_ro("result_number", [](PyOpResult &self) { - return mlirOpResultGetResultNumber(self.get()); - }); - } -}; - -/// Returns the list of types of the values held by container. -template -static std::vector getValueTypes(Container &container, - PyMlirContextRef &context) { - std::vector result; - result.reserve(container.size()); - for (int i = 0, e = container.size(); i < e; ++i) { - result.push_back(mlirValueGetType(container.getElement(i).get())); - } - return result; -} - /// A list of block arguments. Internally, these are stored as consecutive /// elements, random access is cheap. The argument list is associated with the /// operation that contains the block (detached blocks are not allowed in @@ -2484,53 +2556,6 @@ class PyOpOperandList : public Sliceable { PyOperationRef operation; }; -/// A list of operation results. Internally, these are stored as consecutive -/// elements, random access is cheap. The (returned) result list is associated -/// with the operation whose results these are, and thus extends the lifetime of -/// this operation. -class PyOpResultList : public Sliceable { -public: - static constexpr const char *pyClassName = "OpResultList"; - using SliceableT = Sliceable; - - PyOpResultList(PyOperationRef operation, intptr_t startIndex = 0, - intptr_t length = -1, intptr_t step = 1) - : Sliceable(startIndex, - length == -1 ? mlirOperationGetNumResults(operation->get()) - : length, - step), - operation(std::move(operation)) {} - - static void bindDerived(ClassTy &c) { - c.def_prop_ro("types", [](PyOpResultList &self) { - return getValueTypes(self, self.operation->getContext()); - }); - c.def_prop_ro("owner", [](PyOpResultList &self) { - return self.operation->createOpView(); - }); - } - -private: - /// Give the parent CRTP class access to hook implementations below. - friend class Sliceable; - - intptr_t getRawNumElements() { - operation->checkValid(); - return mlirOperationGetNumResults(operation->get()); - } - - PyOpResult getRawElement(intptr_t index) { - PyValue value(operation, mlirOperationGetResult(operation->get(), index)); - return PyOpResult(value); - } - - PyOpResultList slice(intptr_t startIndex, intptr_t length, intptr_t step) { - return PyOpResultList(operation, startIndex, length, step); - } - - PyOperationRef operation; -}; - /// A list of operation successors. Internally, these are stored as consecutive /// elements, random access is cheap. 
The (returned) successor list is /// associated with the operation whose successors these are, and thus extends @@ -3123,20 +3148,7 @@ void mlir::python::populateIRCore(nb::module_ &m) { "result", [](PyOperationBase &self) { auto &operation = self.getOperation(); - auto numResults = mlirOperationGetNumResults(operation); - if (numResults != 1) { - auto name = mlirIdentifierStr(mlirOperationGetName(operation)); - throw nb::value_error( - (Twine("Cannot call .result on operation ") + - StringRef(name.data, name.length) + " which has " + - Twine(numResults) + - " results (it is only valid for operations with a " - "single result)") - .str() - .c_str()); - } - return PyOpResult(operation.getRef(), - mlirOperationGetResult(operation, 0)) + return PyOpResult(operation.getRef(), getUniqueResult(operation)) .maybeDownCast(); }, "Shortcut to get an op result if it has only one (throws an error " @@ -3233,14 +3245,36 @@ void mlir::python::populateIRCore(nb::module_ &m) { nb::arg("walk_order") = MlirWalkPostOrder); nb::class_(m, "Operation") - .def_static("create", &PyOperation::create, nb::arg("name"), - nb::arg("results").none() = nb::none(), - nb::arg("operands").none() = nb::none(), - nb::arg("attributes").none() = nb::none(), - nb::arg("successors").none() = nb::none(), - nb::arg("regions") = 0, nb::arg("loc").none() = nb::none(), - nb::arg("ip").none() = nb::none(), - nb::arg("infer_type") = false, kOperationCreateDocstring) + .def_static( + "create", + [](std::string_view name, + std::optional> results, + std::optional> operands, + std::optional attributes, + std::optional> successors, int regions, + DefaultingPyLocation location, const nb::object &maybeIp, + bool inferType) { + // Unpack/validate operands. + llvm::SmallVector mlirOperands; + if (operands) { + mlirOperands.reserve(operands->size()); + for (PyValue *operand : *operands) { + if (!operand) + throw nb::value_error("operand value cannot be None"); + mlirOperands.push_back(operand->get()); + } + } + + return PyOperation::create(name, results, mlirOperands, attributes, + successors, regions, location, maybeIp, + inferType); + }, + nb::arg("name"), nb::arg("results").none() = nb::none(), + nb::arg("operands").none() = nb::none(), + nb::arg("attributes").none() = nb::none(), + nb::arg("successors").none() = nb::none(), nb::arg("regions") = 0, + nb::arg("loc").none() = nb::none(), nb::arg("ip").none() = nb::none(), + nb::arg("infer_type") = false, kOperationCreateDocstring) .def_static( "parse", [](const std::string &sourceStr, const std::string &sourceName, diff --git a/mlir/lib/Bindings/Python/IRModule.h b/mlir/lib/Bindings/Python/IRModule.h index fd70ac7ac6ec3..dd6e7ef912374 100644 --- a/mlir/lib/Bindings/Python/IRModule.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -686,7 +686,7 @@ class PyOperation : public PyOperationBase, public BaseContextObject { /// Creates an operation. See corresponding python docstring. 
static nanobind::object create(std::string_view name, std::optional> results, - std::optional> operands, + llvm::ArrayRef operands, std::optional attributes, std::optional> successors, int regions, DefaultingPyLocation location, const nanobind::object &ip, diff --git a/mlir/python/mlir/dialects/_ods_common.py b/mlir/python/mlir/dialects/_ods_common.py index 5b67ab03d6f49..d3dbdc604ef4c 100644 --- a/mlir/python/mlir/dialects/_ods_common.py +++ b/mlir/python/mlir/dialects/_ods_common.py @@ -115,7 +115,10 @@ def get_op_results_or_values( _cext.ir.Operation, _Sequence[_Union[_cext.ir.OpView, _cext.ir.Operation, _cext.ir.Value]], ] -) -> _Union[_Sequence[_cext.ir.Value], _cext.ir.OpResultList]: +) -> _Union[ + _Sequence[_Union[_cext.ir.OpView, _cext.ir.Operation, _cext.ir.Value]], + _cext.ir.OpResultList, +]: """Returns the given sequence of values or the results of the given op. This is useful to implement op constructors so that they can take other ops as @@ -127,7 +130,7 @@ def get_op_results_or_values( elif isinstance(arg, _cext.ir.Operation): return arg.results else: - return [get_op_result_or_value(element) for element in arg] + return arg def get_op_result_or_op_results( diff --git a/mlir/test/mlir-tblgen/op-python-bindings.td b/mlir/test/mlir-tblgen/op-python-bindings.td index 25833779c2f71..72963cac64d54 100644 --- a/mlir/test/mlir-tblgen/op-python-bindings.td +++ b/mlir/test/mlir-tblgen/op-python-bindings.td @@ -27,8 +27,8 @@ def AttrSizedOperandsOp : TestOp<"attr_sized_operands", // CHECK: attributes = {} // CHECK: regions = None // CHECK: operands.append(_get_op_results_or_values(variadic1)) - // CHECK: operands.append(_get_op_result_or_value(non_variadic)) - // CHECK: operands.append(_get_op_result_or_value(variadic2) if variadic2 is not None else None) + // CHECK: operands.append(non_variadic) + // CHECK: operands.append(variadic2) // CHECK: _ods_successors = None // CHECK: super().__init__( // CHECK: self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS, @@ -173,8 +173,8 @@ def AttributedOpWithOperands : TestOp<"attributed_op_with_operands"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_0)) - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_2)) + // CHECK: operands.append(_gen_arg_0) + // CHECK: operands.append(_gen_arg_2) // CHECK: if bool(in_): attributes["in"] = _ods_ir.UnitAttr.get( // CHECK: _ods_get_default_loc_context(loc)) // CHECK: if is_ is not None: attributes["is"] = (is_ @@ -307,9 +307,9 @@ def MissingNamesOp : TestOp<"missing_names"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_0)) - // CHECK: operands.append(_get_op_result_or_value(f32)) - // CHECK: operands.append(_get_op_result_or_value(_gen_arg_2)) + // CHECK: operands.append(_gen_arg_0) + // CHECK: operands.append(f32) + // CHECK: operands.append(_gen_arg_2) // CHECK: results.append(i32) // CHECK: results.append(_gen_res_1) // CHECK: results.append(i64) @@ -349,8 +349,8 @@ def OneOptionalOperandOp : TestOp<"one_optional_operand"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(non_optional)) - // CHECK: if optional is not None: operands.append(_get_op_result_or_value(optional)) + // CHECK: operands.append(non_optional) + // CHECK: if optional is not None: operands.append(optional) // CHECK: 
_ods_successors = None // CHECK: super().__init__( // CHECK: self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS @@ -380,7 +380,7 @@ def OneVariadicOperandOp : TestOp<"one_variadic_operand"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(non_variadic)) + // CHECK: operands.append(non_variadic) // CHECK: operands.extend(_get_op_results_or_values(variadic)) // CHECK: _ods_successors = None // CHECK: super().__init__( @@ -445,7 +445,7 @@ def PythonKeywordOp : TestOp<"python_keyword"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(in_)) + // CHECK: operands.append(in_) // CHECK: _ods_successors = None // CHECK: super().__init__( // CHECK: self.OPERATION_NAME, self._ODS_REGIONS, self._ODS_OPERAND_SEGMENTS, self._ODS_RESULT_SEGMENTS @@ -547,8 +547,8 @@ def SimpleOp : TestOp<"simple"> { // CHECK: results = [] // CHECK: attributes = {} // CHECK: regions = None - // CHECK: operands.append(_get_op_result_or_value(i32)) - // CHECK: operands.append(_get_op_result_or_value(f32)) + // CHECK: operands.append(i32) + // CHECK: operands.append(f32) // CHECK: results.append(i64) // CHECK: results.append(f64) // CHECK: _ods_successors = None diff --git a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp index e1540d1750ff1..604d2376052a8 100644 --- a/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp +++ b/mlir/tools/mlir-tblgen/OpPythonBindingGen.cpp @@ -37,7 +37,6 @@ from ._ods_common import ( equally_sized_accessor as _ods_equally_sized_accessor, get_default_loc_context as _ods_get_default_loc_context, get_op_result_or_op_results as _get_op_result_or_op_results, - get_op_result_or_value as _get_op_result_or_value, get_op_results_or_values as _get_op_results_or_values, segmented_accessor as _ods_segmented_accessor, ) @@ -501,17 +500,15 @@ constexpr const char *initTemplate = R"Py( /// Template for appending a single element to the operand/result list. /// {0} is the field name. -constexpr const char *singleOperandAppendTemplate = - "operands.append(_get_op_result_or_value({0}))"; +constexpr const char *singleOperandAppendTemplate = "operands.append({0})"; constexpr const char *singleResultAppendTemplate = "results.append({0})"; /// Template for appending an optional element to the operand/result list. /// {0} is the field name. 
constexpr const char *optionalAppendOperandTemplate = - "if {0} is not None: operands.append(_get_op_result_or_value({0}))"; + "if {0} is not None: operands.append({0})"; constexpr const char *optionalAppendAttrSizedOperandsTemplate = - "operands.append(_get_op_result_or_value({0}) if {0} is not None else " - "None)"; + "operands.append({0})"; constexpr const char *optionalAppendResultTemplate = "if {0} is not None: results.append({0})"; From f2b253b9613a858ae3dd5bf5ccbba87b64941688 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Fri, 24 Jan 2025 09:28:27 -0500 Subject: [PATCH 009/432] [SelectionDAG] Fix an incorrect DebugLoc on a COPY (#122963) Fixes: SWDEV-502134 --- .../lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 5 +- llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll | 70 +++++++++++++++++++ 2 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index 8e313fb21eede..333ec5e98b2bc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -351,8 +351,9 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB, OpRC = TRI->getAllocatableClass(OpRC); assert(OpRC && "Constraints cannot be fulfilled for allocation"); Register NewVReg = MRI->createVirtualRegister(OpRC); - BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(), - TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg); + BuildMI(*MBB, InsertPos, MIB->getDebugLoc(), + TII->get(TargetOpcode::COPY), NewVReg) + .addReg(VReg); VReg = NewVReg; } else { assert(ConstrainedRC->isAllocatable() && diff --git a/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll new file mode 100644 index 0000000000000..8b54f709eec7a --- /dev/null +++ b/llvm/test/DebugInfo/AMDGPU/debug-loc-copy.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefix=GCN %s + +; Verify that the debug locations in this function are correct, in particular +; that the location for %cast doesn't appear in the block of %lab. 
+ +define void @_Z12lane_pc_testj() #0 !dbg !9 { +; GCN-LABEL: _Z12lane_pc_testj: +; GCN: .Lfunc_begin0: +; GCN-NEXT: .file 0 "/" "t.cpp" +; GCN-NEXT: .loc 0 3 0 ; t.cpp:3:0 +; GCN-NEXT: .cfi_sections .debug_frame +; GCN-NEXT: .cfi_startproc +; GCN-NEXT: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: ; %bb.1: ; %lab +; GCN-NEXT: .Ltmp0: +; GCN-NEXT: .loc 0 12 1 prologue_end ; t.cpp:12:1 +; GCN-NEXT: s_mov_b64 s[4:5], src_private_base +; GCN-NEXT: s_mov_b32 s6, 32 +; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s6 +; GCN-NEXT: s_mov_b64 s[6:7], 0 +; GCN-NEXT: s_mov_b32 s5, -1 +; GCN-NEXT: s_lshr_b32 s8, s32, 5 +; GCN-NEXT: s_cmp_lg_u32 s8, s5 +; GCN-NEXT: s_cselect_b32 s5, s4, s7 +; GCN-NEXT: s_cselect_b32 s4, s8, s6 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: .loc 0 13 1 ; t.cpp:13:1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: v_mov_b32_e32 v2, 1 +; GCN-NEXT: .loc 0 14 1 ; t.cpp:14:1 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_setpc_b64 s[30:31] +; GCN-NEXT: .Ltmp1: + %alloc = alloca i32, align 4, addrspace(5) + %cast = addrspacecast ptr addrspace(5) %alloc to ptr, !dbg !12 + br label %lab + +lab: + store i32 0, ptr %cast, align 4, !dbg !13 + store i32 1, ptr %cast, align 4, !dbg !14 + ret void +} + +attributes #0 = { noinline optnone } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 20.0.0git", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "t.cpp", directory: "/") +!2 = !{i32 1, !"amdhsa_code_object_version", i32 500} +!3 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} +!4 = !{i32 7, !"Dwarf Version", i32 5} +!5 = !{i32 2, !"Debug Info Version", i32 3} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 8, !"PIC Level", i32 2} +!8 = !{i32 7, !"frame-pointer", i32 2} +!9 = distinct !DISubprogram(name: "lane_pc_test", linkageName: "_Z12lane_pc_testj", scope: !1, file: !1, line: 1, scopeLine: 3, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, type: !10, unit: !0) +!10 = !DISubroutineType(types: !11) +!11 = !{} +!12 = !DILocation(line: 12, column: 1, scope: !9) +!13 = !DILocation(line: 13, column: 1, scope: !9) +!14 = !DILocation(line: 14, column: 1, scope: !9) From 2068b1ba031e258a6448bea372005d19692c802a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 24 Jan 2025 15:31:53 +0100 Subject: [PATCH 010/432] [X86] Fix ABI for passing after i128 (#124134) If we're passing an i128 value and we no longer have enough argument registers (only r9 unallocated), the value gets passed via the stack. However, r9 is still allocated as a shadow register, which means that a following i64 argument will not use it. This doesn't match the x86-64 psABI. Fix this by marking i128 arguments as requiring consecutive registers, and then adding a custom CC lowering that will allocate both parts of the i128 at the same time, either to register or to stack, without reserving a shadow register. Fixes https://github.com/llvm/llvm-project/issues/123935.
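As a rough model of the allocation rule just described, with hypothetical names throughout (not the real CCState or CC_X86_64_I128 interfaces shown in the diff below): the two i64 halves of an i128 either occupy a consecutive register pair or the whole value takes a 16-byte stack slot, and in the stack case the register cursor is deliberately left alone so no shadow register is consumed.

    #include <cstdio>

    static const char *GPRs[] = {"rdi", "rsi", "rdx", "rcx", "r8", "r9"};

    struct ArgState {
      int NextGPR = 0;     // index of the next free integer register
      int StackOffset = 0; // next free, 16-byte-aligned stack slot
    };

    // Allocates one i128. Returns true if it landed in a register pair.
    static bool allocI128(ArgState &S, const char *&Lo, const char *&Hi,
                          int &Off) {
      if (S.NextGPR + 2 <= 6) { // a full consecutive pair is still free
        Lo = GPRs[S.NextGPR++];
        Hi = GPRs[S.NextGPR++];
        return true;
      }
      Off = S.StackOffset; // whole value goes to the stack...
      S.StackOffset += 16;
      return false;        // ...and NextGPR is deliberately not bumped
    }

    int main() {
      ArgState S;
      S.NextGPR = 5; // only r9 unallocated, as in the issue
      const char *Lo = nullptr, *Hi = nullptr;
      int Off = 0;
      if (!allocI128(S, Lo, Hi, Off))
        std::printf("i128 -> stack+%d\n", Off);         // stack+0
      std::printf("next i64 -> %s\n", GPRs[S.NextGPR]); // r9, no shadow reserved
    }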
---
 llvm/lib/Target/X86/X86CallingConv.cpp      | 34 ++++++++++++
 llvm/lib/Target/X86/X86CallingConv.td       |  6 +--
 llvm/lib/Target/X86/X86ISelLowering.h       |  4 ++
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp |  8 +++
 llvm/test/CodeGen/X86/addcarry.ll           |  2 +-
 .../CodeGen/X86/apx/flags-copy-lowering.ll  |  8 +--
 llvm/test/CodeGen/X86/avgflooru-i128.ll     |  2 +-
 llvm/test/CodeGen/X86/fmuladd-soft-float.ll | 54 +++++++++----------
 llvm/test/CodeGen/X86/i128-abi.ll           | 10 ++--
 llvm/test/CodeGen/X86/sadd_sat_vec.ll       | 36 ++++++------
 llvm/test/CodeGen/X86/ssub_sat_vec.ll       | 36 ++++++------
 llvm/test/CodeGen/X86/subcarry.ll           |  2 +-
 llvm/test/CodeGen/X86/uadd_sat_vec.ll       |  8 +--
 llvm/test/CodeGen/X86/usub_sat_vec.ll       |  8 +--
 14 files changed, 130 insertions(+), 88 deletions(-)

diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index 7359ef341dde5..0b4c63f7a81f7 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -340,5 +340,39 @@ static bool CC_X86_64_Pointer(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
   return false;
 }
 
+/// Special handling for i128: Either allocate the value to two consecutive
+/// i64 registers, or to the stack. Do not partially allocate in registers,
+/// and do not reserve any registers when allocating to the stack.
+static bool CC_X86_64_I128(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  assert(ValVT == MVT::i64 && "Should have i64 parts");
+  SmallVectorImpl<CCValAssign> &PendingMembers = State.getPendingLocs();
+  PendingMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (!ArgFlags.isInConsecutiveRegsLast())
+    return true;
+
+  unsigned NumRegs = PendingMembers.size();
+  assert(NumRegs == 2 && "Should have two parts");
+
+  static const MCPhysReg Regs[] = {X86::RDI, X86::RSI, X86::RDX,
+                                   X86::RCX, X86::R8,  X86::R9};
+  ArrayRef<MCPhysReg> Allocated = State.AllocateRegBlock(Regs, NumRegs);
+  if (!Allocated.empty()) {
+    PendingMembers[0].convertToReg(Allocated[0]);
+    PendingMembers[1].convertToReg(Allocated[1]);
+  } else {
+    int64_t Offset = State.AllocateStack(16, Align(16));
+    PendingMembers[0].convertToMem(Offset);
+    PendingMembers[1].convertToMem(Offset + 8);
+  }
+  State.addLoc(PendingMembers[0]);
+  State.addLoc(PendingMembers[1]);
+  PendingMembers.clear();
+  return true;
+}
+
 // Provides entry points of CC_X86 and RetCC_X86.
 #include "X86GenCallingConv.inc"
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 91af111db8cda..72b103b0bb0c5 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -548,11 +548,9 @@ def CC_X86_64_C : CallingConv<[
   CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
 
   // i128 can be either passed in two i64 registers, or on the stack, but
-  // not split across register and stack. As such, do not allow using R9
-  // for a split i64.
+  // not split across register and stack. Handle this with a custom function.
CCIfType<[i64], - CCIfSplit>>, - CCIfType<[i64], CCIfSplit>>, + CCIfConsecutiveRegs>>, CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>, diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index e07bcd989c518..fe79fefeed631 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1604,6 +1604,10 @@ namespace llvm { LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override; + bool functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg, + const DataLayout &DL) const override; + bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool supportSwiftError() const override; diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp index 4a4fd246cb7cd..6835c7e336a5c 100644 --- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp +++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp @@ -233,6 +233,14 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, return VT.changeVectorElementTypeToInteger(); } +bool X86TargetLowering::functionArgumentNeedsConsecutiveRegisters( + Type *Ty, CallingConv::ID CallConv, bool isVarArg, + const DataLayout &DL) const { + // i128 split into i64 needs to be allocated to two consecutive registers, + // or spilled to the stack as a whole. + return Ty->isIntegerTy(128); +} + /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { diff --git a/llvm/test/CodeGen/X86/addcarry.ll b/llvm/test/CodeGen/X86/addcarry.ll index f8d32fc2d2925..97894db1188e2 100644 --- a/llvm/test/CodeGen/X86/addcarry.ll +++ b/llvm/test/CodeGen/X86/addcarry.ll @@ -49,7 +49,7 @@ define i256 @add256(i256 %a, i256 %b) nounwind { ; CHECK-LABEL: add256: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: addq %r9, %rsi ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %r8 diff --git a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll index deca130a04ff0..bd764c2edef29 100644 --- a/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll +++ b/llvm/test/CodeGen/X86/apx/flags-copy-lowering.ll @@ -31,15 +31,15 @@ define <2 x i128> @flag_copy_2(<2 x i128> %x, <2 x i128> %y) nounwind { ; CHECK-NEXT: movq %r8, %rdi ; CHECK-NEXT: {nf} sarq $63, %rdi ; CHECK-NEXT: cmovoq %rdi, %rcx -; CHECK-NEXT: movabsq $-9223372036854775808, %r9 # imm = 0x8000000000000000 -; CHECK-NEXT: {nf} xorq %r9, %rdi +; CHECK-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 +; CHECK-NEXT: {nf} xorq %r10, %rdi ; CHECK-NEXT: cmovnoq %r8, %rdi -; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: subq %r9, %rsi ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: movq %rdx, %r8 ; CHECK-NEXT: {nf} sarq $63, %r8 ; CHECK-NEXT: cmovoq %r8, %rsi -; CHECK-NEXT: {nf} xorq %r9, %r8 +; CHECK-NEXT: {nf} xorq %r10, %r8 ; CHECK-NEXT: cmovnoq %rdx, %r8 ; CHECK-NEXT: movq %rcx, 16(%rax) ; CHECK-NEXT: movq %rsi, (%rax) diff --git a/llvm/test/CodeGen/X86/avgflooru-i128.ll b/llvm/test/CodeGen/X86/avgflooru-i128.ll index da16a7da48ca6..11e886e25ba4e 100644 --- a/llvm/test/CodeGen/X86/avgflooru-i128.ll +++ b/llvm/test/CodeGen/X86/avgflooru-i128.ll @@ -119,7 +119,7 @@ define <2 x i128> @avgflooru_i128_vec(<2 x 
i128> %x, <2 x i128> %y) { ; CHECK-LABEL: avgflooru_i128_vec: ; CHECK: # %bb.0: # %start ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: addq %r9, %rsi ; CHECK-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: setb %dil ; CHECK-NEXT: movzbl %dil, %edi diff --git a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll index ccb2f37590b0a..cbdfa32ed4627 100644 --- a/llvm/test/CodeGen/X86/fmuladd-soft-float.ll +++ b/llvm/test/CodeGen/X86/fmuladd-soft-float.ll @@ -1555,30 +1555,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-NEXT: .cfi_offset %r14, -32 ; SOFT-FLOAT-64-NEXT: .cfi_offset %r15, -24 ; SOFT-FLOAT-64-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-NEXT: movq %r9, %rbp ; SOFT-FLOAT-64-NEXT: movq %rcx, %r14 ; SOFT-FLOAT-64-NEXT: movq %rdx, %r15 -; SOFT-FLOAT-64-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-NEXT: movq %rsi, %r13 ; SOFT-FLOAT-64-NEXT: movq %rdi, %rbx -; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: movq %r8, %rdi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT -; SOFT-FLOAT-64-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-NEXT: movq %rax, %r12 ; SOFT-FLOAT-64-NEXT: movq %r14, %rdi -; SOFT-FLOAT-64-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %r14 ; SOFT-FLOAT-64-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %r15 -; SOFT-FLOAT-64-NEXT: movq %r12, %rdi -; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-NEXT: movq %rbp, %rsi ; SOFT-FLOAT-64-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT -; SOFT-FLOAT-64-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-NEXT: movq %rax, %r13 ; SOFT-FLOAT-64-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT @@ -1587,13 +1587,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, %r14 -; SOFT-FLOAT-64-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-NEXT: movq %r12, %rdi ; SOFT-FLOAT-64-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-NEXT: movq %rax, 24(%rbx) ; SOFT-FLOAT-64-NEXT: movq %r14, 16(%rbx) ; SOFT-FLOAT-64-NEXT: movq %r15, 8(%rbx) -; SOFT-FLOAT-64-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-NEXT: movq %r13, (%rbx) ; SOFT-FLOAT-64-NEXT: movq %rbx, %rax ; SOFT-FLOAT-64-NEXT: addq $8, %rsp ; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 56 @@ -1633,30 +1633,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r14, -32 ; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %r15, -24 ; SOFT-FLOAT-64-FMA-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA-NEXT: movq %r9, %rbp ; SOFT-FLOAT-64-FMA-NEXT: movq %rcx, %r14 ; SOFT-FLOAT-64-FMA-NEXT: movq %rdx, %r15 -; SOFT-FLOAT-64-FMA-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-FMA-NEXT: movq %rsi, %r13 ; SOFT-FLOAT-64-FMA-NEXT: movq %rdi, %rbx -; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: movq %r8, 
%rdi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT -; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r12 ; SOFT-FLOAT-64-FMA-NEXT: movq %r14, %rdi -; SOFT-FLOAT-64-FMA-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14 ; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r15 -; SOFT-FLOAT-64-FMA-NEXT: movq %r12, %rdi -; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq %rbp, %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT -; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r13 ; SOFT-FLOAT-64-FMA-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT @@ -1665,13 +1665,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, %r14 -; SOFT-FLOAT-64-FMA-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA-NEXT: movq %r12, %rdi ; SOFT-FLOAT-64-FMA-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA-NEXT: movq %rax, 24(%rbx) ; SOFT-FLOAT-64-FMA-NEXT: movq %r14, 16(%rbx) ; SOFT-FLOAT-64-FMA-NEXT: movq %r15, 8(%rbx) -; SOFT-FLOAT-64-FMA-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-FMA-NEXT: movq %r13, (%rbx) ; SOFT-FLOAT-64-FMA-NEXT: movq %rbx, %rax ; SOFT-FLOAT-64-FMA-NEXT: addq $8, %rsp ; SOFT-FLOAT-64-FMA-NEXT: .cfi_def_cfa_offset 56 @@ -1711,30 +1711,30 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r14, -32 ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %r15, -24 ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_offset %rbp, -16 +; SOFT-FLOAT-64-FMA4-NEXT: movq %r9, %rbp ; SOFT-FLOAT-64-FMA4-NEXT: movq %rcx, %r14 ; SOFT-FLOAT-64-FMA4-NEXT: movq %rdx, %r15 -; SOFT-FLOAT-64-FMA4-NEXT: movq %rsi, %r12 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rsi, %r13 ; SOFT-FLOAT-64-FMA4-NEXT: movq %rdi, %rbx -; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: movq %r8, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT -; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r13 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r12 ; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, %rdi -; SOFT-FLOAT-64-FMA4-NEXT: movq %rbp, %rsi +; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r14 ; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r15 -; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, %rdi -; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq %rbp, %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __muldf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT -; 
SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r12 +; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r13 ; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT @@ -1743,13 +1743,13 @@ define <4 x double> @fmuladd_contract_v4f64(<4 x double> %a, <4 x double> %b, <4 ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, %r14 -; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, %rdi +; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, %rdi ; SOFT-FLOAT-64-FMA4-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; SOFT-FLOAT-64-FMA4-NEXT: callq __adddf3@PLT ; SOFT-FLOAT-64-FMA4-NEXT: movq %rax, 24(%rbx) ; SOFT-FLOAT-64-FMA4-NEXT: movq %r14, 16(%rbx) ; SOFT-FLOAT-64-FMA4-NEXT: movq %r15, 8(%rbx) -; SOFT-FLOAT-64-FMA4-NEXT: movq %r12, (%rbx) +; SOFT-FLOAT-64-FMA4-NEXT: movq %r13, (%rbx) ; SOFT-FLOAT-64-FMA4-NEXT: movq %rbx, %rax ; SOFT-FLOAT-64-FMA4-NEXT: addq $8, %rsp ; SOFT-FLOAT-64-FMA4-NEXT: .cfi_def_cfa_offset 56 diff --git a/llvm/test/CodeGen/X86/i128-abi.ll b/llvm/test/CodeGen/X86/i128-abi.ll index 23eb6ec0322ab..264c546b4cae2 100644 --- a/llvm/test/CodeGen/X86/i128-abi.ll +++ b/llvm/test/CodeGen/X86/i128-abi.ll @@ -31,7 +31,7 @@ define i128 @on_stack2(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i1 define i64 @trailing_arg_on_stack(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i128 %a5, i64 %a6) { ; CHECK-LABEL: trailing_arg_on_stack: ; CHECK: # %bb.0: -; CHECK-NEXT: movq 24(%rsp), %rax +; CHECK-NEXT: movq %r9, %rax ; CHECK-NEXT: retq ret i64 %a6 } @@ -78,20 +78,18 @@ define void @call_trailing_arg_on_stack(i128 %x, i64 %y) nounwind { ; CHECK-LABEL: call_trailing_arg_on_stack: ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: movq %rdx, %rax -; CHECK-NEXT: movq %rsi, %r9 +; CHECK-NEXT: movq %rdx, %r9 +; CHECK-NEXT: movq %rsi, %rax ; CHECK-NEXT: movq %rdi, %r10 -; CHECK-NEXT: subq $8, %rsp ; CHECK-NEXT: movl $1, %esi ; CHECK-NEXT: movl $2, %edx ; CHECK-NEXT: movl $3, %ecx ; CHECK-NEXT: movl $4, %r8d ; CHECK-NEXT: xorl %edi, %edi ; CHECK-NEXT: pushq %rax -; CHECK-NEXT: pushq %r9 ; CHECK-NEXT: pushq %r10 ; CHECK-NEXT: callq trailing_arg_on_stack@PLT -; CHECK-NEXT: addq $32, %rsp +; CHECK-NEXT: addq $16, %rsp ; CHECK-NEXT: popq %rax ; CHECK-NEXT: retq call i128 @trailing_arg_on_stack(i64 0, i64 1, i64 2, i64 3, i64 4, i128 %x, i64 %y) diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 322acd76e12e6..bd563f97b0ac4 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -1795,27 +1795,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: seto %dil -; SSE-NEXT: movq %r8, %r9 -; SSE-NEXT: sarq $63, %r9 +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: sarq $63, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmovneq %r9, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; SSE-NEXT: xorq %r10, %r9 +; SSE-NEXT: cmovneq %r10, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; SSE-NEXT: xorq %r11, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmoveq %r8, %r9 -; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: cmoveq %r8, %r10 +; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: seto %dil ; SSE-NEXT: movq %rdx, %r8 ; SSE-NEXT: sarq $63, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmovneq %r8, %rsi -; 
SSE-NEXT: xorq %r10, %r8 +; SSE-NEXT: xorq %r11, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmoveq %rdx, %r8 ; SSE-NEXT: movq %rcx, 16(%rax) ; SSE-NEXT: movq %rsi, (%rax) -; SSE-NEXT: movq %r9, 24(%rax) +; SSE-NEXT: movq %r10, 24(%rax) ; SSE-NEXT: movq %r8, 8(%rax) ; SSE-NEXT: retq ; @@ -1825,27 +1825,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: seto %dil -; AVX-NEXT: movq %r8, %r9 -; AVX-NEXT: sarq $63, %r9 +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: sarq $63, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmovneq %r9, %rcx -; AVX-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; AVX-NEXT: xorq %r10, %r9 +; AVX-NEXT: cmovneq %r10, %rcx +; AVX-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; AVX-NEXT: xorq %r11, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmoveq %r8, %r9 -; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: cmoveq %r8, %r10 +; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: seto %dil ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: sarq $63, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmovneq %r8, %rsi -; AVX-NEXT: xorq %r10, %r8 +; AVX-NEXT: xorq %r11, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmoveq %rdx, %r8 ; AVX-NEXT: movq %rcx, 16(%rax) ; AVX-NEXT: movq %rsi, (%rax) -; AVX-NEXT: movq %r9, 24(%rax) +; AVX-NEXT: movq %r10, 24(%rax) ; AVX-NEXT: movq %r8, 8(%rax) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %x, <2 x i128> %y) diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index ac8b561abf003..88df3c175ec9c 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -2026,27 +2026,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: seto %dil -; SSE-NEXT: movq %r8, %r9 -; SSE-NEXT: sarq $63, %r9 +; SSE-NEXT: movq %r8, %r10 +; SSE-NEXT: sarq $63, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmovneq %r9, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; SSE-NEXT: xorq %r10, %r9 +; SSE-NEXT: cmovneq %r10, %rcx +; SSE-NEXT: movabsq $-9223372036854775808, %r11 # imm = 0x8000000000000000 +; SSE-NEXT: xorq %r11, %r10 ; SSE-NEXT: testb %dil, %dil -; SSE-NEXT: cmoveq %r8, %r9 -; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: cmoveq %r8, %r10 +; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: seto %dil ; SSE-NEXT: movq %rdx, %r8 ; SSE-NEXT: sarq $63, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmovneq %r8, %rsi -; SSE-NEXT: xorq %r10, %r8 +; SSE-NEXT: xorq %r11, %r8 ; SSE-NEXT: testb %dil, %dil ; SSE-NEXT: cmoveq %rdx, %r8 ; SSE-NEXT: movq %rcx, 16(%rax) ; SSE-NEXT: movq %rsi, (%rax) -; SSE-NEXT: movq %r9, 24(%rax) +; SSE-NEXT: movq %r10, 24(%rax) ; SSE-NEXT: movq %r8, 8(%rax) ; SSE-NEXT: retq ; @@ -2056,27 +2056,27 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: seto %dil -; AVX-NEXT: movq %r8, %r9 -; AVX-NEXT: sarq $63, %r9 +; AVX-NEXT: movq %r8, %r10 +; AVX-NEXT: sarq $63, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmovneq %r9, %rcx -; AVX-NEXT: movabsq $-9223372036854775808, %r10 # imm = 0x8000000000000000 -; AVX-NEXT: xorq %r10, %r9 +; AVX-NEXT: cmovneq %r10, %rcx +; AVX-NEXT: movabsq 
$-9223372036854775808, %r11 # imm = 0x8000000000000000 +; AVX-NEXT: xorq %r11, %r10 ; AVX-NEXT: testb %dil, %dil -; AVX-NEXT: cmoveq %r8, %r9 -; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: cmoveq %r8, %r10 +; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: seto %dil ; AVX-NEXT: movq %rdx, %r8 ; AVX-NEXT: sarq $63, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmovneq %r8, %rsi -; AVX-NEXT: xorq %r10, %r8 +; AVX-NEXT: xorq %r11, %r8 ; AVX-NEXT: testb %dil, %dil ; AVX-NEXT: cmoveq %rdx, %r8 ; AVX-NEXT: movq %rcx, 16(%rax) ; AVX-NEXT: movq %rsi, (%rax) -; AVX-NEXT: movq %r9, 24(%rax) +; AVX-NEXT: movq %r10, 24(%rax) ; AVX-NEXT: movq %r8, 8(%rax) ; AVX-NEXT: retq %z = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %x, <2 x i128> %y) diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll index 1e9db9f55a8d5..9538ea1061cd1 100644 --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -21,7 +21,7 @@ define i256 @sub256(i256 %a, i256 %b) nounwind { ; CHECK-LABEL: sub256: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; CHECK-NEXT: subq %r9, %rsi ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %rcx ; CHECK-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll index 1ff95c876a6b1..d744ce6ed6af0 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -1161,11 +1161,11 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE-LABEL: v2i128: ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %rax -; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: addq %r9, %rsi ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; SSE-NEXT: movq $-1, %rdi -; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: cmovbq %rdi, %rsi +; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: cmovbq %rdi, %r8 @@ -1179,11 +1179,11 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX-LABEL: v2i128: ; AVX: # %bb.0: ; AVX-NEXT: movq %rdi, %rax -; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: addq %r9, %rsi ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %rdx ; AVX-NEXT: movq $-1, %rdi -; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: cmovbq %rdi, %rsi +; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: addq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: adcq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: cmovbq %rdi, %r8 diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index 34eb30dfebeeb..4e17ca6fbae33 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -1057,10 +1057,10 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movq %rdi, %rax ; SSE-NEXT: xorl %edi, %edi -; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; SSE-NEXT: subq %r9, %rsi ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: cmovbq %rdi, %rsi +; SSE-NEXT: cmovbq %rdi, %rdx ; SSE-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; SSE-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; SSE-NEXT: cmovbq %rdi, %r8 @@ -1075,10 +1075,10 @@ define <2 x i128> @v2i128(<2 x i128> %x, <2 x i128> %y) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: movq %rdi, %rax ; AVX-NEXT: xorl %edi, %edi -; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rsi +; AVX-NEXT: subq %r9, %rsi ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %rdx -; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: cmovbq %rdi, 
%rsi +; AVX-NEXT: cmovbq %rdi, %rdx ; AVX-NEXT: subq {{[0-9]+}}(%rsp), %rcx ; AVX-NEXT: sbbq {{[0-9]+}}(%rsp), %r8 ; AVX-NEXT: cmovbq %rdi, %r8 From 9cf52fe1f94fdcd8e27c76f7d33a80eeb2075833 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Fri, 24 Jan 2025 14:48:47 +0000 Subject: [PATCH 011/432] [flang][OpenMP][NFC] test the current private dealloc runtime calls (#124017) It looks like in most cases we still don't make calls to deallocate allocatable members of derived types which have been privatized. This is just intended to add a test for the one case where we do, to make sure this doesn't regress with my upcoming changes. --- flang/test/Lower/OpenMP/derived-type-allocatable.f90 | 3 +++ 1 file changed, 3 insertions(+) diff --git a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 b/flang/test/Lower/OpenMP/derived-type-allocatable.f90 index 2dc4e20f27af2..1d6e22212eedd 100644 --- a/flang/test/Lower/OpenMP/derived-type-allocatable.f90 +++ b/flang/test/Lower/OpenMP/derived-type-allocatable.f90 @@ -24,6 +24,9 @@ module m1 !CHECK-LABEL: omp.private {type = private} @_QMm1Ftest_array_of_allocs !CHECK: fir.call @_FortranAInitializeClone !CHECK-NEXT: omp.yield +!CHECK: } dealloc { +!CHECK: fir.call @_FortranAAllocatableDeallocate +!CHECK: omp.yield !CHECK-LABEL: omp.private {type = firstprivate} @_QMm1Ftest_array !CHECK-NOT: fir.call @_FortranAInitializeClone From c546b5317c518987a5f45dd4c4d25321a955c758 Mon Sep 17 00:00:00 2001 From: DianQK Date: Fri, 24 Jan 2025 23:02:50 +0800 Subject: [PATCH 012/432] [ValueTracking] Pass changed predicate `SignedLPred` to `isImpliedByMatchingCmp` (#124271) Fixes #124267. Since we are using the new predicate, we should also update the parameters of `isImpliedByMatchingCmp`. --- llvm/lib/Analysis/ValueTracking.cpp | 4 +- .../implied-condition-samesign.ll | 48 +++++++++++++++++++ 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 264fedd6b66b9..eba728c7c8c36 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -9494,7 +9494,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, SignedLPred == ICmpInst::ICMP_SGE) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonPositive()) && - ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == false) + ICmpInst::isImpliedByMatchingCmp(SignedLPred, RPred) == false) return false; } @@ -9504,7 +9504,7 @@ isImpliedCondICmps(const ICmpInst *LHS, CmpPredicate RPred, const Value *R0, SignedLPred == ICmpInst::ICMP_SLE) && match(R0, m_NSWSub(m_Specific(L0), m_Specific(L1)))) { if (match(R1, m_NonNegative()) && - ICmpInst::isImpliedByMatchingCmp(LPred, RPred) == true) + ICmpInst::isImpliedByMatchingCmp(SignedLPred, RPred) == true) return true; } diff --git a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll index 0e6db403512ae..9a0591245fae0 100644 --- a/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll +++ b/llvm/test/Analysis/ValueTracking/implied-condition-samesign.ll @@ -292,3 +292,51 @@ taken: end: ret i32 0 } + +define i1 @gt_sub_nsw_ult(i8 %L0, i8 %L1, i1 %V) { +; CHECK-LABEL: define i1 @gt_sub_nsw_ult( +; CHECK-SAME: i8 [[L0:%.*]], i8 [[L1:%.*]], i1 [[V:%.*]]) { +; CHECK-NEXT: [[LHS:%.*]] = icmp samesign ugt i8 [[L0]], [[L1]] +; CHECK-NEXT: br i1 [[LHS]], label %[[LHS_TRUE:.*]], label %[[LHS_FALSE:.*]] +; CHECK: [[LHS_TRUE]]: +; CHECK-NEXT: [[R0:%.*]] = 
sub nsw i8 [[L0]], [[L1]]
+; CHECK-NEXT: [[RHS:%.*]] = icmp ult i8 [[R0]], -1
+; CHECK-NEXT: ret i1 [[RHS]]
+; CHECK: [[LHS_FALSE]]:
+; CHECK-NEXT: ret i1 [[V]]
+;
+  %LHS = icmp samesign ugt i8 %L0, %L1
+  br i1 %LHS, label %LHS_true, label %LHS_false
+
+LHS_true:
+  %R0 = sub nsw i8 %L0, %L1
+  %RHS = icmp ult i8 %R0, -1
+  ret i1 %RHS
+
+LHS_false:
+  ret i1 %V
+}
+
+define i1 @lt_sub_nsw_ult(i8 %L0, i8 %L1, i1 %V) {
+; CHECK-LABEL: define i1 @lt_sub_nsw_ult(
+; CHECK-SAME: i8 [[L0:%.*]], i8 [[L1:%.*]], i1 [[V:%.*]]) {
+; CHECK-NEXT: [[LHS:%.*]] = icmp samesign ult i8 [[L0]], [[L1]]
+; CHECK-NEXT: br i1 [[LHS]], label %[[LHS_TRUE:.*]], label %[[LHS_FALSE:.*]]
+; CHECK: [[LHS_TRUE]]:
+; CHECK-NEXT: [[R0:%.*]] = sub nsw i8 [[L0]], [[L1]]
+; CHECK-NEXT: [[RHS:%.*]] = icmp ult i8 [[R0]], 1
+; CHECK-NEXT: ret i1 [[RHS]]
+; CHECK: [[LHS_FALSE]]:
+; CHECK-NEXT: ret i1 [[V]]
+;
+  %LHS = icmp samesign ult i8 %L0, %L1
+  br i1 %LHS, label %LHS_true, label %LHS_false
+
+LHS_true:
+  %R0 = sub nsw i8 %L0, %L1
+  %RHS = icmp ult i8 %R0, 1
+  ret i1 %RHS
+
+LHS_false:
+  ret i1 %V
+}

From a12d7e4b611f0db2525da68f5576beaeeb6c84ac Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 24 Jan 2025 15:13:13 +0000
Subject: [PATCH 013/432] [SLP] getVectorCallCosts - don't provide scalar argument data for vector IntrinsicCostAttributes (#124254)

getVectorCallCosts determines the cost of a vector intrinsic, based on
an existing scalar intrinsic call - but we were including the scalar
argument data in the IntrinsicCostAttributes, which meant that not only
was the cost calculation not type-only based, it was making incorrect
assumptions about constant values etc.

This also exposed an issue where x86 relied on fallback calculations for
funnel shift costs - this is great when we have the argument data as
that improves the accuracy of uniform shift amounts etc., but it meant
that type-only costs would default to Cost=2 for all custom lowered
funnel shifts, which was far too cheap.

This is the reverse of #124129 where we weren't including argument data
when we could.

Fixes #63980
---
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  18 +
 .../Transforms/Vectorize/SLPVectorizer.cpp    |   4 +-
 .../SLPVectorizer/X86/arith-fshl-rot.ll       | 445 ++++++++----------
 .../SLPVectorizer/X86/arith-fshr-rot.ll       | 445 ++++++++----------
 4 files changed, 409 insertions(+), 503 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 34ba46f5e6cfd..d3c923a76d074 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4719,6 +4719,24 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
       if (auto KindCost = Entry->Cost[CostKind])
         return adjustTableCost(Entry->ISD, *KindCost, LT, ICA.getFlags());
+
+    // Without arg data, we need to compute the expanded costs of custom lowered
+    // intrinsics to prevent use of the (very low) default costs.
+    if (ICA.isTypeBasedOnly() &&
+        (IID == Intrinsic::fshl || IID == Intrinsic::fshr)) {
+      Type *CondTy = RetTy->getWithNewBitWidth(1);
+      InstructionCost Cost = 0;
+      Cost += getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::Shl, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::LShr, RetTy, CostKind);
+      Cost += getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
+                                 CmpInst::ICMP_EQ, CostKind);
+      Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
+                                 CmpInst::ICMP_EQ, CostKind);
+      return Cost;
+    }
   }
 
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 040c57703b7c6..eea6b32460d70 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9031,9 +9031,7 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
   FastMathFlags FMF;
   if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
     FMF = FPCI->getFastMathFlags();
-  SmallVector<const Value *> Arguments(CI->args());
-  IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
-                                    dyn_cast<IntrinsicInst>(CI));
+  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF);
   auto IntrinsicCost =
       TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 153191b1eea08..3b526c4537243 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512
 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2
-; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -240,16 +240,46 @@ define void @fshl_v16i32() { ; SSE-NEXT: store i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshl_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) -; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshl_v16i32( +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; AVX1-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; AVX1-NEXT: store <4 x i32> [[TMP3]], ptr @d32, align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; AVX1-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; AVX1-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshl_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), 
align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: @fshl_v16i32( +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX256-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX256-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshl_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 @@ -333,155 +363,136 @@ define void @fshl_v16i32() { } define void @fshl_v32i16() { -; SSE2-LABEL: @fshl_v32i16( -; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr 
inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 
19), align 2 -; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) -; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) -; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) -; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) -; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) -; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) -; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) -; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) -; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) -; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) -; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) -; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) -; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) -; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) -; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) -; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) -; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) -; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) -; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) -; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) -; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) -; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) -; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) -; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) -; SSE2-NEXT: [[R24:%.*]] = call i16 
@llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) -; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) -; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) -; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) -; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) -; SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) -; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) -; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) -; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2 -; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 -; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 -; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 -; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 -; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 -; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 -; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 -; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 -; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 -; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 -; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 -; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 -; SSE2-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 -; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 -; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 -; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 -; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 -; SSE2-NEXT: store i16 [[R27]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 -; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 -; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 -; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 -; SSE2-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 -; SSE2-NEXT: ret void -; -; SSE4-LABEL: @fshl_v32i16( -; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2 -; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) -; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) -; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) -; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE4-NEXT: ret void +; SSE-LABEL: @fshl_v32i16( +; SSE-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], 
ptr @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, ptr 
getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; SSE-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = call i16 @llvm.fshl.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) +; SSE-NEXT: [[R1:%.*]] = call i16 @llvm.fshl.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) +; SSE-NEXT: [[R2:%.*]] = call i16 @llvm.fshl.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) +; SSE-NEXT: [[R3:%.*]] = call i16 @llvm.fshl.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) +; SSE-NEXT: [[R4:%.*]] = call i16 @llvm.fshl.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) +; SSE-NEXT: [[R5:%.*]] = call i16 @llvm.fshl.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) +; SSE-NEXT: [[R6:%.*]] = call i16 @llvm.fshl.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) +; SSE-NEXT: [[R7:%.*]] = call i16 @llvm.fshl.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) +; SSE-NEXT: [[R8:%.*]] = call i16 @llvm.fshl.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) +; SSE-NEXT: [[R9:%.*]] = call i16 @llvm.fshl.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) +; SSE-NEXT: [[R10:%.*]] = call i16 @llvm.fshl.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) +; SSE-NEXT: [[R11:%.*]] = call i16 @llvm.fshl.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) +; SSE-NEXT: [[R12:%.*]] = call i16 @llvm.fshl.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) +; SSE-NEXT: [[R13:%.*]] = call i16 @llvm.fshl.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) +; SSE-NEXT: [[R14:%.*]] = call i16 @llvm.fshl.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) +; SSE-NEXT: [[R15:%.*]] = call i16 @llvm.fshl.i16(i16 [[A15]], 
i16 [[A15]], i16 [[B15]]) +; SSE-NEXT: [[R16:%.*]] = call i16 @llvm.fshl.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) +; SSE-NEXT: [[R17:%.*]] = call i16 @llvm.fshl.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) +; SSE-NEXT: [[R18:%.*]] = call i16 @llvm.fshl.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) +; SSE-NEXT: [[R19:%.*]] = call i16 @llvm.fshl.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) +; SSE-NEXT: [[R20:%.*]] = call i16 @llvm.fshl.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) +; SSE-NEXT: [[R21:%.*]] = call i16 @llvm.fshl.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) +; SSE-NEXT: [[R22:%.*]] = call i16 @llvm.fshl.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) +; SSE-NEXT: [[R23:%.*]] = call i16 @llvm.fshl.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) +; SSE-NEXT: [[R24:%.*]] = call i16 @llvm.fshl.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) +; SSE-NEXT: [[R25:%.*]] = call i16 @llvm.fshl.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) +; SSE-NEXT: [[R26:%.*]] = call i16 @llvm.fshl.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) +; SSE-NEXT: [[R27:%.*]] = call i16 @llvm.fshl.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) +; SSE-NEXT: [[R28:%.*]] = call i16 @llvm.fshl.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) +; SSE-NEXT: [[R29:%.*]] = call i16 @llvm.fshl.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) +; SSE-NEXT: [[R30:%.*]] = call i16 @llvm.fshl.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) +; SSE-NEXT: [[R31:%.*]] = call i16 @llvm.fshl.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) +; SSE-NEXT: store i16 [[R0]], ptr @d16, align 2 +; SSE-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void ; ; AVX-LABEL: @fshl_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 @@ -944,52 +955,16 @@ define void @fshl_v64i8() { } define void @fshl_v2i32() { -; SSE-LABEL: @fshl_v2i32( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshl_v2i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshl_v2i32( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX2-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void 
-; -; AVX256-LABEL: @fshl_v2i32( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX256-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshl_v2i32( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshl_v2i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshl_v2i32( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 @@ -1011,44 +986,14 @@ define void @fshl_v2i32() { ; PR63980 define void @fshl_v2i32_uniformconst() { -; SSE-LABEL: @fshl_v2i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshl_v2i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshl_v2i32_uniformconst( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshl_v2i32_uniformconst( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; 
AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshl_v2i32_uniformconst( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshl_v2i32_uniformconst( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A0]], i32 [[A0]], i32 1) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A1]], i32 [[A1]], i32 1) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshl_v2i32_uniformconst( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll index 4d50ffad7f8b5..aae540b4b2454 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=SSE,SSE4 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX,AVX256 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=+prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX256 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -mattr=-prefer-256-bit -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=znver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512VBMI2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX512 @a64 = common global [8 x i64] zeroinitializer, align 64 @b64 = common global [8 x i64] zeroinitializer, align 64 @@ -240,16 +240,46 @@ define void @fshr_v16i32() { ; SSE-NEXT: store 
i32 [[R15]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 15), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @fshr_v16i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 -; AVX-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) -; AVX-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 -; AVX-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 -; AVX-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) -; AVX-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @fshr_v16i32( +; AVX1-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @a32, align 4 +; AVX1-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @b32, align 4 +; AVX1-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> [[TMP2]]) +; AVX1-NEXT: store <4 x i32> [[TMP3]], ptr @d32, align 4 +; AVX1-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP4]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]]) +; AVX1-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 4), align 4 +; AVX1-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]]) +; AVX1-NEXT: store <4 x i32> [[TMP9]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX1-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP11:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 12), align 4 +; AVX1-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> [[TMP10]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]]) +; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 12), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @fshr_v16i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX2-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX2-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX2-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX2-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX2-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX2-NEXT: ret void +; +; AVX256-LABEL: 
@fshr_v16i32( +; AVX256-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @a32, align 4 +; AVX256-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr @b32, align 4 +; AVX256-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP1]], <8 x i32> [[TMP2]]) +; AVX256-NEXT: store <8 x i32> [[TMP3]], ptr @d32, align 4 +; AVX256-NEXT: [[TMP4:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP5:%.*]] = load <8 x i32>, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 8), align 4 +; AVX256-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.fshr.v8i32(<8 x i32> [[TMP4]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]) +; AVX256-NEXT: store <8 x i32> [[TMP6]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 8), align 4 +; AVX256-NEXT: ret void ; ; AVX512-LABEL: @fshr_v16i32( ; AVX512-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @a32, align 4 @@ -333,155 +363,136 @@ define void @fshr_v16i32() { } define void @fshr_v32i16() { -; SSE2-LABEL: @fshr_v32i16( -; SSE2-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 -; SSE2-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[A15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 -; SSE2-NEXT: 
[[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 -; SSE2-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 -; SSE2-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 -; SSE2-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 -; SSE2-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 -; SSE2-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 -; SSE2-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 -; SSE2-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 -; SSE2-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE2-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 -; SSE2-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 -; SSE2-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 -; SSE2-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 -; SSE2-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 -; SSE2-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 -; SSE2-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 -; SSE2-NEXT: [[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE2-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 -; SSE2-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 -; SSE2-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 -; SSE2-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 -; SSE2-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 -; SSE2-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 -; SSE2-NEXT: [[B23:%.*]] = load i16, ptr getelementptr 
inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 -; SSE2-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE2-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 -; SSE2-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 -; SSE2-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 -; SSE2-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 -; SSE2-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 -; SSE2-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 -; SSE2-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 -; SSE2-NEXT: [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) -; SSE2-NEXT: [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) -; SSE2-NEXT: [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) -; SSE2-NEXT: [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) -; SSE2-NEXT: [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) -; SSE2-NEXT: [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) -; SSE2-NEXT: [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) -; SSE2-NEXT: [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) -; SSE2-NEXT: [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) -; SSE2-NEXT: [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) -; SSE2-NEXT: [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) -; SSE2-NEXT: [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) -; SSE2-NEXT: [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) -; SSE2-NEXT: [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) -; SSE2-NEXT: [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) -; SSE2-NEXT: [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) -; SSE2-NEXT: [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) -; SSE2-NEXT: [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) -; SSE2-NEXT: [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) -; SSE2-NEXT: [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) -; SSE2-NEXT: [[R20:%.*]] = call i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) -; SSE2-NEXT: [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) -; SSE2-NEXT: [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) -; SSE2-NEXT: [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) -; SSE2-NEXT: [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) -; SSE2-NEXT: [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) -; SSE2-NEXT: [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) -; SSE2-NEXT: [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) -; SSE2-NEXT: [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) -; 
SSE2-NEXT: [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) -; SSE2-NEXT: [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) -; SSE2-NEXT: [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) -; SSE2-NEXT: store i16 [[R0]], ptr @d16, align 2 -; SSE2-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 -; SSE2-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 -; SSE2-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 -; SSE2-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 -; SSE2-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 -; SSE2-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 -; SSE2-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 -; SSE2-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE2-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 -; SSE2-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 -; SSE2-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 -; SSE2-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 -; SSE2-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 -; SSE2-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 -; SSE2-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 -; SSE2-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE2-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 -; SSE2-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 -; SSE2-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 -; SSE2-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 -; SSE2-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 -; SSE2-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 -; SSE2-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 -; SSE2-NEXT: store i16 [[R24]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE2-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 -; SSE2-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 -; SSE2-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 -; SSE2-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 -; SSE2-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 -; SSE2-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 -; SSE2-NEXT: store i16 [[R31]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 -; SSE2-NEXT: ret void -; -; SSE4-LABEL: @fshr_v32i16( -; SSE4-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @a16, align 2 -; SSE4-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr @b16, align 2 -; SSE4-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i16> [[TMP2]]) -; SSE4-NEXT: store <8 x i16> [[TMP3]], ptr @d16, align 2 -; SSE4-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) -; SSE4-NEXT: store <8 x i16> [[TMP6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 -; SSE4-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP8:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP9:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP7]], <8 x i16> [[TMP7]], <8 x i16> [[TMP8]]) -; SSE4-NEXT: store <8 x i16> [[TMP9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 -; SSE4-NEXT: [[TMP10:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP11:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 -; SSE4-NEXT: [[TMP12:%.*]] = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> [[TMP10]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]]) -; SSE4-NEXT: store <8 x i16> [[TMP12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 -; SSE4-NEXT: ret void +; SSE-LABEL: @fshr_v32i16( +; SSE-NEXT: [[A0:%.*]] = load i16, ptr @a16, align 2 +; SSE-NEXT: [[A1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 1), align 2 +; SSE-NEXT: [[A2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 2), align 2 +; SSE-NEXT: [[A3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 3), align 2 +; SSE-NEXT: [[A4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 4), align 2 +; SSE-NEXT: [[A5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 5), align 2 +; SSE-NEXT: [[A6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 6), align 2 +; SSE-NEXT: [[A7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 7), align 2 +; SSE-NEXT: [[A8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 8), align 2 +; SSE-NEXT: [[A9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 9), align 2 +; SSE-NEXT: [[A10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 10), align 2 +; SSE-NEXT: [[A11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 11), align 2 +; SSE-NEXT: [[A12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 12), align 2 +; SSE-NEXT: [[A13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 13), align 2 +; SSE-NEXT: [[A14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 14), align 2 +; SSE-NEXT: [[A15:%.*]] = load i16, ptr getelementptr 
inbounds ([32 x i16], ptr @a16, i32 0, i64 15), align 2 +; SSE-NEXT: [[A16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 16), align 2 +; SSE-NEXT: [[A17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 17), align 2 +; SSE-NEXT: [[A18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 18), align 2 +; SSE-NEXT: [[A19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 19), align 2 +; SSE-NEXT: [[A20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 20), align 2 +; SSE-NEXT: [[A21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 21), align 2 +; SSE-NEXT: [[A22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 22), align 2 +; SSE-NEXT: [[A23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 23), align 2 +; SSE-NEXT: [[A24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 24), align 2 +; SSE-NEXT: [[A25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 25), align 2 +; SSE-NEXT: [[A26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 26), align 2 +; SSE-NEXT: [[A27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 27), align 2 +; SSE-NEXT: [[A28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 28), align 2 +; SSE-NEXT: [[A29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 29), align 2 +; SSE-NEXT: [[A30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 30), align 2 +; SSE-NEXT: [[A31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @a16, i32 0, i64 31), align 2 +; SSE-NEXT: [[B0:%.*]] = load i16, ptr @b16, align 2 +; SSE-NEXT: [[B1:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 1), align 2 +; SSE-NEXT: [[B2:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 2), align 2 +; SSE-NEXT: [[B3:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 3), align 2 +; SSE-NEXT: [[B4:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 4), align 2 +; SSE-NEXT: [[B5:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 5), align 2 +; SSE-NEXT: [[B6:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 6), align 2 +; SSE-NEXT: [[B7:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 7), align 2 +; SSE-NEXT: [[B8:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 8), align 2 +; SSE-NEXT: [[B9:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 9), align 2 +; SSE-NEXT: [[B10:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 10), align 2 +; SSE-NEXT: [[B11:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 11), align 2 +; SSE-NEXT: [[B12:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 12), align 2 +; SSE-NEXT: [[B13:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 13), align 2 +; SSE-NEXT: [[B14:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 14), align 2 +; SSE-NEXT: [[B15:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 15), align 2 +; SSE-NEXT: 
[[B16:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 16), align 2 +; SSE-NEXT: [[B17:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 17), align 2 +; SSE-NEXT: [[B18:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 18), align 2 +; SSE-NEXT: [[B19:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 19), align 2 +; SSE-NEXT: [[B20:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 20), align 2 +; SSE-NEXT: [[B21:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 21), align 2 +; SSE-NEXT: [[B22:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 22), align 2 +; SSE-NEXT: [[B23:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 23), align 2 +; SSE-NEXT: [[B24:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 24), align 2 +; SSE-NEXT: [[B25:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 25), align 2 +; SSE-NEXT: [[B26:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 26), align 2 +; SSE-NEXT: [[B27:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 27), align 2 +; SSE-NEXT: [[B28:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 28), align 2 +; SSE-NEXT: [[B29:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 29), align 2 +; SSE-NEXT: [[B30:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 30), align 2 +; SSE-NEXT: [[B31:%.*]] = load i16, ptr getelementptr inbounds ([32 x i16], ptr @b16, i32 0, i64 31), align 2 +; SSE-NEXT: [[R0:%.*]] = call i16 @llvm.fshr.i16(i16 [[A0]], i16 [[A0]], i16 [[B0]]) +; SSE-NEXT: [[R1:%.*]] = call i16 @llvm.fshr.i16(i16 [[A1]], i16 [[A1]], i16 [[B1]]) +; SSE-NEXT: [[R2:%.*]] = call i16 @llvm.fshr.i16(i16 [[A2]], i16 [[A2]], i16 [[B2]]) +; SSE-NEXT: [[R3:%.*]] = call i16 @llvm.fshr.i16(i16 [[A3]], i16 [[A3]], i16 [[B3]]) +; SSE-NEXT: [[R4:%.*]] = call i16 @llvm.fshr.i16(i16 [[A4]], i16 [[A4]], i16 [[B4]]) +; SSE-NEXT: [[R5:%.*]] = call i16 @llvm.fshr.i16(i16 [[A5]], i16 [[A5]], i16 [[B5]]) +; SSE-NEXT: [[R6:%.*]] = call i16 @llvm.fshr.i16(i16 [[A6]], i16 [[A6]], i16 [[B6]]) +; SSE-NEXT: [[R7:%.*]] = call i16 @llvm.fshr.i16(i16 [[A7]], i16 [[A7]], i16 [[B7]]) +; SSE-NEXT: [[R8:%.*]] = call i16 @llvm.fshr.i16(i16 [[A8]], i16 [[A8]], i16 [[B8]]) +; SSE-NEXT: [[R9:%.*]] = call i16 @llvm.fshr.i16(i16 [[A9]], i16 [[A9]], i16 [[B9]]) +; SSE-NEXT: [[R10:%.*]] = call i16 @llvm.fshr.i16(i16 [[A10]], i16 [[A10]], i16 [[B10]]) +; SSE-NEXT: [[R11:%.*]] = call i16 @llvm.fshr.i16(i16 [[A11]], i16 [[A11]], i16 [[B11]]) +; SSE-NEXT: [[R12:%.*]] = call i16 @llvm.fshr.i16(i16 [[A12]], i16 [[A12]], i16 [[B12]]) +; SSE-NEXT: [[R13:%.*]] = call i16 @llvm.fshr.i16(i16 [[A13]], i16 [[A13]], i16 [[B13]]) +; SSE-NEXT: [[R14:%.*]] = call i16 @llvm.fshr.i16(i16 [[A14]], i16 [[A14]], i16 [[B14]]) +; SSE-NEXT: [[R15:%.*]] = call i16 @llvm.fshr.i16(i16 [[A15]], i16 [[A15]], i16 [[B15]]) +; SSE-NEXT: [[R16:%.*]] = call i16 @llvm.fshr.i16(i16 [[A16]], i16 [[A16]], i16 [[B16]]) +; SSE-NEXT: [[R17:%.*]] = call i16 @llvm.fshr.i16(i16 [[A17]], i16 [[A17]], i16 [[B17]]) +; SSE-NEXT: [[R18:%.*]] = call i16 @llvm.fshr.i16(i16 [[A18]], i16 [[A18]], i16 [[B18]]) +; SSE-NEXT: [[R19:%.*]] = call i16 @llvm.fshr.i16(i16 [[A19]], i16 [[A19]], i16 [[B19]]) +; SSE-NEXT: [[R20:%.*]] = call 
i16 @llvm.fshr.i16(i16 [[A20]], i16 [[A20]], i16 [[B20]]) +; SSE-NEXT: [[R21:%.*]] = call i16 @llvm.fshr.i16(i16 [[A21]], i16 [[A21]], i16 [[B21]]) +; SSE-NEXT: [[R22:%.*]] = call i16 @llvm.fshr.i16(i16 [[A22]], i16 [[A22]], i16 [[B22]]) +; SSE-NEXT: [[R23:%.*]] = call i16 @llvm.fshr.i16(i16 [[A23]], i16 [[A23]], i16 [[B23]]) +; SSE-NEXT: [[R24:%.*]] = call i16 @llvm.fshr.i16(i16 [[A24]], i16 [[A24]], i16 [[B24]]) +; SSE-NEXT: [[R25:%.*]] = call i16 @llvm.fshr.i16(i16 [[A25]], i16 [[A25]], i16 [[B25]]) +; SSE-NEXT: [[R26:%.*]] = call i16 @llvm.fshr.i16(i16 [[A26]], i16 [[A26]], i16 [[B26]]) +; SSE-NEXT: [[R27:%.*]] = call i16 @llvm.fshr.i16(i16 [[A27]], i16 [[A27]], i16 [[B27]]) +; SSE-NEXT: [[R28:%.*]] = call i16 @llvm.fshr.i16(i16 [[A28]], i16 [[A28]], i16 [[B28]]) +; SSE-NEXT: [[R29:%.*]] = call i16 @llvm.fshr.i16(i16 [[A29]], i16 [[A29]], i16 [[B29]]) +; SSE-NEXT: [[R30:%.*]] = call i16 @llvm.fshr.i16(i16 [[A30]], i16 [[A30]], i16 [[B30]]) +; SSE-NEXT: [[R31:%.*]] = call i16 @llvm.fshr.i16(i16 [[A31]], i16 [[A31]], i16 [[B31]]) +; SSE-NEXT: store i16 [[R0]], ptr @d16, align 2 +; SSE-NEXT: store i16 [[R1]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 1), align 2 +; SSE-NEXT: store i16 [[R2]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 2), align 2 +; SSE-NEXT: store i16 [[R3]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 3), align 2 +; SSE-NEXT: store i16 [[R4]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 4), align 2 +; SSE-NEXT: store i16 [[R5]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 5), align 2 +; SSE-NEXT: store i16 [[R6]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 6), align 2 +; SSE-NEXT: store i16 [[R7]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 7), align 2 +; SSE-NEXT: store i16 [[R8]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 8), align 2 +; SSE-NEXT: store i16 [[R9]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 9), align 2 +; SSE-NEXT: store i16 [[R10]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 10), align 2 +; SSE-NEXT: store i16 [[R11]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 11), align 2 +; SSE-NEXT: store i16 [[R12]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 12), align 2 +; SSE-NEXT: store i16 [[R13]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 13), align 2 +; SSE-NEXT: store i16 [[R14]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 14), align 2 +; SSE-NEXT: store i16 [[R15]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 15), align 2 +; SSE-NEXT: store i16 [[R16]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 16), align 2 +; SSE-NEXT: store i16 [[R17]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 17), align 2 +; SSE-NEXT: store i16 [[R18]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 18), align 2 +; SSE-NEXT: store i16 [[R19]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 19), align 2 +; SSE-NEXT: store i16 [[R20]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 20), align 2 +; SSE-NEXT: store i16 [[R21]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 21), align 2 +; SSE-NEXT: store i16 [[R22]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 22), align 2 +; SSE-NEXT: store i16 [[R23]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 23), align 2 +; SSE-NEXT: store i16 [[R24]], ptr 
getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 24), align 2 +; SSE-NEXT: store i16 [[R25]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 25), align 2 +; SSE-NEXT: store i16 [[R26]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 26), align 2 +; SSE-NEXT: store i16 [[R27]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 27), align 2 +; SSE-NEXT: store i16 [[R28]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 28), align 2 +; SSE-NEXT: store i16 [[R29]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 29), align 2 +; SSE-NEXT: store i16 [[R30]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 30), align 2 +; SSE-NEXT: store i16 [[R31]], ptr getelementptr inbounds ([32 x i16], ptr @d16, i32 0, i64 31), align 2 +; SSE-NEXT: ret void ; ; AVX-LABEL: @fshr_v32i16( ; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @a16, align 2 @@ -944,52 +955,16 @@ define void @fshr_v64i8() { } define void @fshr_v2i32() { -; SSE-LABEL: @fshr_v2i32( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; SSE-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshr_v2i32( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX1-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshr_v2i32( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 -; AVX2-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshr_v2i32( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX256-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX256-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshr_v2i32( -; AVX512-NEXT: 
[[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @b32, align 4 -; AVX512-NEXT: [[TMP3:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> [[TMP2]]) -; AVX512-NEXT: store <2 x i32> [[TMP3]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: @fshr_v2i32( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[B0:%.*]] = load i32, ptr @b32, align 4 +; CHECK-NEXT: [[B1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @b32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 [[B0]]) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 [[B1]]) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshr_v2i32( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 @@ -1011,44 +986,14 @@ define void @fshr_v2i32() { ; PR63980 define void @fshr_v2i32_uniformconst() { -; SSE-LABEL: @fshr_v2i32_uniformconst( -; SSE-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; SSE-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; SSE-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; SSE-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; SSE-NEXT: store i32 [[R0]], ptr @d32, align 4 -; SSE-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; SSE-NEXT: ret void -; -; AVX1-LABEL: @fshr_v2i32_uniformconst( -; AVX1-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX1-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX1-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX1-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX1-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX1-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX1-NEXT: ret void -; -; AVX2-LABEL: @fshr_v2i32_uniformconst( -; AVX2-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 -; AVX2-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 -; AVX2-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) -; AVX2-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) -; AVX2-NEXT: store i32 [[R0]], ptr @d32, align 4 -; AVX2-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 -; AVX2-NEXT: ret void -; -; AVX256-LABEL: @fshr_v2i32_uniformconst( -; AVX256-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX256-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX256-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX256-NEXT: ret void -; -; AVX512-LABEL: @fshr_v2i32_uniformconst( -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 -; AVX512-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> splat (i32 1)) -; AVX512-NEXT: store <2 x i32> [[TMP2]], ptr @d32, align 4 -; AVX512-NEXT: ret void +; CHECK-LABEL: 
@fshr_v2i32_uniformconst( +; CHECK-NEXT: [[A0:%.*]] = load i32, ptr @a32, align 4 +; CHECK-NEXT: [[A1:%.*]] = load i32, ptr getelementptr inbounds ([16 x i32], ptr @a32, i32 0, i64 1), align 4 +; CHECK-NEXT: [[R0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A0]], i32 [[A0]], i32 1) +; CHECK-NEXT: [[R1:%.*]] = call i32 @llvm.fshr.i32(i32 [[A1]], i32 [[A1]], i32 1) +; CHECK-NEXT: store i32 [[R0]], ptr @d32, align 4 +; CHECK-NEXT: store i32 [[R1]], ptr getelementptr inbounds ([16 x i32], ptr @d32, i32 0, i64 1), align 4 +; CHECK-NEXT: ret void ; ; AVX512VBMI2-LABEL: @fshr_v2i32_uniformconst( ; AVX512VBMI2-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr @a32, align 4 From 6c11b7e689c89ff46e4472810dd555434eab1010 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 24 Jan 2025 07:23:22 -0800 Subject: [PATCH 014/432] [CodeGen] NFC: Change order of checks in MachineInstr->isDead() (#124207) https://github.com/llvm/llvm-project/pull/123531 moved isDead() from DeadMachineInstrElim to MachineInstr. In the process of moving, I reordered the checks to improve chances of early exit, but this has caused a slight increase in compile time. This PR reverts back to the original order of checks. --- llvm/lib/CodeGen/MachineInstr.cpp | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 0f7f525fa479e..a9f756b684360 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -1353,18 +1353,6 @@ bool MachineInstr::wouldBeTriviallyDead() const { bool MachineInstr::isDead(const MachineRegisterInfo &MRI, LiveRegUnits *LivePhysRegs) const { - // Technically speaking inline asm without side effects and no defs can still - // be deleted. But there is so much bad inline asm code out there, we should - // let them be. - if (isInlineAsm()) - return false; - - // If we suspect this instruction may have some side-effects, then we say - // this instruction cannot be dead. - // FIXME: See issue #105950 for why LIFETIME markers are considered dead here. - if (!isLifetimeMarker() && !wouldBeTriviallyDead()) - return false; - // Instructions without side-effects are dead iff they only define dead regs. // This function is hot and this loop returns early in the common case, // so only perform additional checks before this if absolutely necessary. @@ -1385,7 +1373,19 @@ bool MachineInstr::isDead(const MachineRegisterInfo &MRI, } } - return true; + // Technically speaking inline asm without side effects and no defs can still + // be deleted. But there is so much bad inline asm code out there, we should + // let them be. + if (isInlineAsm()) + return false; + + // FIXME: See issue #105950 for why LIFETIME markers are considered dead here. + if (isLifetimeMarker()) + return true; + + // If there are no defs with uses, then we call the instruction dead so long + // as we do not suspect it may have sideeffects. + return wouldBeTriviallyDead(); } static bool MemOperandsHaveAlias(const MachineFrameInfo &MFI, From a94226f9e6f5be4d6978134e7813f22b0510f3d4 Mon Sep 17 00:00:00 2001 From: Eric Astor Date: Fri, 24 Jan 2025 10:30:10 -0500 Subject: [PATCH 015/432] [llvm-ml] Remove unsafe getCurrentSegmentOnly() call (#123355) This call was made unsafe recently, but was not fixed in db48f1a1764023f8efeb055e343b967d1eb37d19 (the commit that fixed the parallel code in AsmParser.cpp).
Fixes #123189 --- llvm/lib/MC/MCParser/COFFMasmParser.cpp | 3 +++ llvm/lib/MC/MCParser/MasmParser.cpp | 3 ++- llvm/test/tools/llvm-ml/bare_proc_error.asm | 7 +++++++ llvm/test/tools/llvm-ml/no_section_error.asm | 4 ++++ 4 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-ml/bare_proc_error.asm create mode 100644 llvm/test/tools/llvm-ml/no_section_error.asm diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index c323e64a40aee..8464a2392680b 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -441,6 +441,9 @@ bool COFFMasmParser::parseDirectiveOption(StringRef Directive, SMLoc Loc) { /// statements /// label "endproc" bool COFFMasmParser::parseDirectiveProc(StringRef Directive, SMLoc Loc) { + if (!getStreamer().getCurrentFragment()) + return Error(getTok().getLoc(), "expected section directive"); + StringRef Label; if (getParser().parseIdentifier(Label)) return Error(Loc, "expected identifier for procedure"); diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp index 78261c1f9fedb..b2c956e0a4598 100644 --- a/llvm/lib/MC/MCParser/MasmParser.cpp +++ b/llvm/lib/MC/MCParser/MasmParser.cpp @@ -1454,7 +1454,8 @@ bool MasmParser::Run(bool NoInitialTextSection, bool NoFinalize) { } bool MasmParser::checkForValidSection() { - if (!ParsingMSInlineAsm && !getStreamer().getCurrentSectionOnly()) { + if (!ParsingMSInlineAsm && !(getStreamer().getCurrentFragment() && + getStreamer().getCurrentSectionOnly())) { Out.initSections(false, getTargetParser().getSTI()); return Error(getTok().getLoc(), "expected section directive before assembly directive"); diff --git a/llvm/test/tools/llvm-ml/bare_proc_error.asm b/llvm/test/tools/llvm-ml/bare_proc_error.asm new file mode 100644 index 0000000000000..59668edafccf1 --- /dev/null +++ b/llvm/test/tools/llvm-ml/bare_proc_error.asm @@ -0,0 +1,7 @@ +; RUN: not llvm-ml -filetype=s %s /Fo /dev/null 2>&1 | FileCheck %s + +; CHECK: :[[# @LINE+1]]:1: error: expected section directive +foo PROC +; CHECK: :[[# @LINE+1]]:6: error: expected section directive before assembly directive + ret +foo ENDP diff --git a/llvm/test/tools/llvm-ml/no_section_error.asm b/llvm/test/tools/llvm-ml/no_section_error.asm new file mode 100644 index 0000000000000..65c111908b81a --- /dev/null +++ b/llvm/test/tools/llvm-ml/no_section_error.asm @@ -0,0 +1,4 @@ +; RUN: not llvm-ml -filetype=s %s /Fo /dev/null 2>&1 | FileCheck %s + +; CHECK: :[[# @LINE + 1]]:6: error: expected section directive before assembly directive in 'BYTE' directive +BYTE 2, 3, 4 From ec66c4af09263e68d800971906e60afc27d54a06 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Fri, 24 Jan 2025 10:44:00 -0500 Subject: [PATCH 016/432] [AMDGPU][True16][CodeGen] true16 codegen pattern for f16 canonicalize (#122000) true16 codegen pattern for f16 canonicalize --- llvm/lib/Target/AMDGPU/SIInstructions.td | 33 +- .../GlobalISel/inst-select-fcanonicalize.mir | 43 ++- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 363 ++++++++++++------ 3 files changed, 304 insertions(+), 135 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index da44faac2f910..15c77c2a723e4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3093,7 +3093,7 @@ foreach vt = [f16, v2f16, f32, v2f32, f64] in { // Prefer selecting to max when legal, but using mul is always valid. 
let AddedComplexity = -5 in { -let OtherPredicates = [NotHasTrue16BitInsts] in { +let True16Predicate = NotHasTrue16BitInsts in { def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) @@ -3103,9 +3103,21 @@ def : GCNPat< (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) >; -} // End OtherPredicates +} // End True16Predicate -let OtherPredicates = [HasTrue16BitInsts] in { +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) +>; + +def : GCNPat< + (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), + (V_MUL_F16_t16_e64 0, (i16 CONST.FP16_NEG_ONE), $src_mods, $src, 0/*Clamp*/, /*omod*/0, /*opsel*/0) +>; +} // End True16Predicate + +let True16Predicate = UseFakeTrue16Insts in { def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src) @@ -3115,7 +3127,7 @@ def : GCNPat< (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))), (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src) >; -} // End OtherPredicates +} // End True16Predicate def : GCNPat< (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))), @@ -3173,13 +3185,22 @@ multiclass SelectCanonicalizeAsMax< def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { - let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, NotHasTrue16BitInsts]); + let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]); + let True16Predicate = NotHasTrue16BitInsts; + } + + def : GCNPat< + (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), + (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { + let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]); + let True16Predicate = UseRealTrue16Insts; } def : GCNPat< (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))), (V_MAX_F16_fake16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> { - let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]); + let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts]); + let True16Predicate = UseFakeTrue16Insts; } def : GCNPat< diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir index c07a2b0b85921..d32634806f7bd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir @@ -2,7 +2,8 @@ # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel-abort=0 -disable-gisel-legality-check -o - %s | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# 
RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -mattr=-real-true16 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- @@ -38,12 +39,20 @@ body: | ; GFX10-NEXT: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]] ; - ; GFX11-LABEL: name: fcanonicalize_f16_denorm - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcanonicalize_f16_denorm + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY1]], 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcanonicalize_f16_denorm + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FCANONICALIZE %1 @@ -84,12 +93,20 @@ body: | ; GFX10-NEXT: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]] ; - ; GFX11-LABEL: name: fcanonicalize_f16_flush - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcanonicalize_f16_flush + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_MAX_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_MAX_F16_t16_e64 0, [[COPY1]], 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcanonicalize_f16_flush + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_MAX_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_MAX_F16_fake16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_MAX_F16_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FCANONICALIZE %1 diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 3c70883f09d2c..b4dbe0e7be924 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 @@ -96,16 +97,27 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v0, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: global_store_b16 v[0:1], v0, off -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v0, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: global_store_b16 v[0:1], v0, off +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, ptr addrspace(1) undef @@ -147,16 +159,29 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: s_test_canonicalize_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, s2, s2 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_test_canonicalize_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: 
global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_test_canonicalize_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, s2, s2 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, ptr addrspace(1) %out @@ -239,16 +264,27 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_fabs_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, |v1|, |v1| -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_fabs_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, |v0.l|, |v0.l| +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_fabs_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, |v1|, |v1| +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) @@ -293,16 +329,27 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1 ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_fabs_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: 
global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fneg half %val.fabs @@ -348,16 +395,27 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_canonicalize_fneg_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_canonicalize_fneg_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_fneg_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) @@ -402,16 +460,27 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -v1, -v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -v0.l, -v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -v1, -v1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fneg = fneg half %val %canonicalized = call half @llvm.canonicalize.f16(half 
%val.fneg) @@ -456,16 +525,27 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt ; CI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; CI-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[0:1] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_u16 v0, v1, s[0:1] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, -|v0.l|, -|v0.l| +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[0:1] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v1, -|v1|, -|v1| +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm %val = load half, ptr addrspace(1) %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fneg half %val.fabs @@ -2325,13 +2405,21 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 { ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half %val, i32 0 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -2358,13 +2446,21 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_undef_reg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_undef_reg_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_undef_reg_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <2 x half> undef, half %val, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec) ret <2 x half> %canonicalized @@ -2513,13 +2609,21 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 { ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_reg_k_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 2.0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_k_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 2.0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_k_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 2.0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <2 x half> undef, half %val, i32 0 %vec1 = insertelement <2 x half> %vec0, half 2.0, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) @@ -2549,13 +2653,21 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 { ; CI-NEXT: v_mov_b32_e32 v0, 2.0 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_k_reg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, 2.0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_k_reg_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, 2.0, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_k_reg_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, 2.0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <2 x half> undef, half 2.0, i32 0 %vec1 = insertelement <2 x half> %vec0, half %val, i32 1 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec1) @@ -2635,14 +2747,23 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 ; CI-NEXT: v_cvt_f32_f16_e32 v0, v0 ; CI-NEXT: s_setpc_b64 s[30:31] 
; -; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x7e007e00 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0x7e007e00 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec = insertelement <4 x half> undef, half %val, i32 0 %canonicalized = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %vec) ret <4 x half> %canonicalized @@ -2725,15 +2846,25 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half ; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 ; CI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, 0 -; GFX11-NEXT: v_pk_max_f16 v1, v1, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, 0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, 0 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v1, v1, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %vec0 = insertelement <4 x half> undef, half %val0, i32 0 %vec1 = insertelement <4 x half> %vec0, half %val1, i32 2 %vec2 = insertelement <4 x half> %vec1, half %val2, i32 3 From 5d2393a222c751723b0906485bf90a28dd4e564b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 24 Jan 2025 08:09:20 -0800 Subject: [PATCH 017/432] [InstCombine] Avoid repeated hash lookups (NFC) (#124243) --- .../Transforms/InstCombine/InstCombineVectorOps.cpp | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 
a9ae09b8dba43..6860a7cd07b78 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -3060,14 +3060,10 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) { unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth; assert(SrcElemsPerTgtElem); BegIdx /= SrcElemsPerTgtElem; - bool BCAlreadyExists = NewBCs.contains(CastSrcTy); - auto *NewBC = - BCAlreadyExists - ? NewBCs[CastSrcTy] - : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); - if (!BCAlreadyExists) - NewBCs[CastSrcTy] = NewBC; - auto *Ext = Builder.CreateExtractElement(NewBC, BegIdx, + auto [It, Inserted] = NewBCs.try_emplace(CastSrcTy); + if (Inserted) + It->second = Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc"); + auto *Ext = Builder.CreateExtractElement(It->second, BegIdx, SVI.getName() + ".extract"); // The shufflevector isn't being replaced: the bitcast that used it // is. InstCombine will visit the newly-created instructions. From 37bf0a10fb4cee10f4acbb7da453e7c19c8ee599 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 24 Jan 2025 17:10:10 +0100 Subject: [PATCH 018/432] [SCEV] Add test for #123550 (NFC) --- .../test/Analysis/ScalarEvolution/pr123550.ll | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 llvm/test/Analysis/ScalarEvolution/pr123550.ll diff --git a/llvm/test/Analysis/ScalarEvolution/pr123550.ll b/llvm/test/Analysis/ScalarEvolution/pr123550.ll new file mode 100644 index 0000000000000..c1f2051248a12 --- /dev/null +++ b/llvm/test/Analysis/ScalarEvolution/pr123550.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -disable-output -passes='print' < %s 2>&1 | FileCheck %s + +; FIXME: This is a miscompile. 
+define i32 @test() { +; CHECK-LABEL: 'test' +; CHECK-NEXT: Classifying expressions for: @test +; CHECK-NEXT: %phi = phi i32 [ -173, %bb ], [ %sub, %loop ] +; CHECK-NEXT: --> (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))) U: empty-set S: empty-set Exits: -173 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv2 = phi i32 [ 1, %bb ], [ %iv2.inc, %loop ] +; CHECK-NEXT: --> {1,+,1}<%loop> U: [1,2) S: [1,2) Exits: 1 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %srem = srem i32 729259140, %phi +; CHECK-NEXT: --> (729259140 + (-1 * (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))) * (729259140 /u (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32)))))) U: empty-set S: empty-set Exits: 729259140 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %trunc = trunc i32 %iv2 to i8 +; CHECK-NEXT: --> {1,+,1}<%loop> U: [1,2) S: [1,2) Exits: 1 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %urem = urem i8 -83, %trunc +; CHECK-NEXT: --> (-83 + ((-83 /u {1,+,1}<%loop>) * {-1,+,-1}<%loop>)) U: [0,1) S: [0,1) Exits: 0 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %zext = zext i8 %urem to i32 +; CHECK-NEXT: --> (zext i8 (-83 + ((-83 /u {1,+,1}<%loop>) * {-1,+,-1}<%loop>)) to i32) U: [0,1) S: [0,1) Exits: 0 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %sub = sub i32 0, %zext +; CHECK-NEXT: --> (-1 * (zext i8 (-83 + ((-83 /u {1,+,1}<%loop>) * {-1,+,-1}<%loop>)) to i32)) U: [0,1) S: [0,1) Exits: 0 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %iv2.inc = add i32 %iv2, 1 +; CHECK-NEXT: --> {2,+,1}<%loop> U: [2,3) S: [2,3) Exits: 2 LoopDispositions: { %loop: Computable } +; CHECK-NEXT: %srem.lcssa = phi i32 [ %srem, %loop ] +; CHECK-NEXT: --> (729259140 + (-1 * (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32))) * (729259140 /u (-173 + (-1 * (zext i8 ((-83 /u {0,+,1}<%loop>) * {0,+,-1}<%loop>) to i32)))))) U: empty-set S: empty-set --> 729259140 U: [729259140,729259141) S: [729259140,729259141) +; CHECK-NEXT: Determining loop execution counts for: @test +; CHECK-NEXT: Loop %loop: backedge-taken count is i32 0 +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 0 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is i32 0 +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +bb: + br label %loop + +loop: + %phi = phi i32 [ -173, %bb ], [ %sub, %loop ] + %iv2 = phi i32 [ 1, %bb ], [ %iv2.inc, %loop ] + %srem = srem i32 729259140, %phi + %trunc = trunc i32 %iv2 to i8 + %urem = urem i8 -83, %trunc + %zext = zext i8 %urem to i32 + %sub = sub i32 0, %zext + %iv2.inc = add i32 %iv2, 1 + %icmp = icmp eq i32 %zext, 0 + br i1 %icmp, label %exit, label %loop + +exit: + %srem.lcssa = phi i32 [ %srem, %loop ] + ret i32 %srem.lcssa +} From 256f40d0e6b2beb0e951b0f5f836847223c5695c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 24 Jan 2025 10:17:20 -0600 Subject: [PATCH 019/432] [libc] Use the NVIDIA device allocator for GPU malloc (#124277) Summary: This is a blocker on another patch in the OpenMP runtime. The problem is that NVIDIA truly doesn't handle RPC-based allocations very well. It cannot reliably update the MMU while a kernel is running and it will usually deadlock if called from a separate thread due to internal use of TLS. This patch just removes the definition of `malloc` and `free` for NVPTX. The result here is that they will be undefined, which is the cue for the `nvlink` linker to define them for us. 
So, as far as `libc` is concerned it still implements malloc. --- libc/src/stdlib/gpu/free.cpp | 4 ++++ libc/src/stdlib/gpu/malloc.cpp | 4 ++++ libc/test/src/stdlib/CMakeLists.txt | 3 ++- 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/libc/src/stdlib/gpu/free.cpp b/libc/src/stdlib/gpu/free.cpp index 1f0e9ec735974..6ef9d718315a5 100644 --- a/libc/src/stdlib/gpu/free.cpp +++ b/libc/src/stdlib/gpu/free.cpp @@ -14,6 +14,10 @@ namespace LIBC_NAMESPACE_DECL { +// FIXME: For now we just default to the NVIDIA device allocator which is +// always available on NVPTX targets. This will be implemented fully later. +#ifndef LIBC_TARGET_ARCH_IS_NVPTX LLVM_LIBC_FUNCTION(void, free, (void *ptr)) { gpu::deallocate(ptr); } +#endif } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/stdlib/gpu/malloc.cpp b/libc/src/stdlib/gpu/malloc.cpp index 54f2d8843996e..b5909cb9cb4d0 100644 --- a/libc/src/stdlib/gpu/malloc.cpp +++ b/libc/src/stdlib/gpu/malloc.cpp @@ -14,8 +14,12 @@ namespace LIBC_NAMESPACE_DECL { +// FIXME: For now we just default to the NVIDIA device allocator which is +// always available on NVPTX targets. This will be implemented fully later. +#ifndef LIBC_TARGET_ARCH_IS_NVPTX LLVM_LIBC_FUNCTION(void *, malloc, (size_t size)) { return gpu::allocate(size); } +#endif } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 8cc0428632ba3..aba76833be9d4 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -420,7 +420,8 @@ if(LLVM_LIBC_FULL_BUILD) ) # Only baremetal and GPU has an in-tree 'malloc' implementation. - if(LIBC_TARGET_OS_IS_BAREMETAL OR LIBC_TARGET_OS_IS_GPU) + if((LIBC_TARGET_OS_IS_BAREMETAL OR LIBC_TARGET_OS_IS_GPU) AND + NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( malloc_test HERMETIC_TEST_ONLY From 7842374103b26933d71a8fe354cd4d8715d55b1c Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 24 Jan 2025 08:20:35 -0800 Subject: [PATCH 020/432] [NFC][TableGen] Emit nested namespaces in InstrInfoEmitter (#124210) - Emit C++17 nested namespaces in InstrInfoEmitter. --- llvm/utils/TableGen/InstrInfoEmitter.cpp | 89 +++++++----------------- 1 file changed, 27 insertions(+), 62 deletions(-) diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 7811734d5fdac..12401a2f246a1 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -283,7 +283,6 @@ void InstrInfoEmitter::emitOperandNameMappings( raw_ostream &OS, const CodeGenTarget &Target, ArrayRef NumberedInstructions) { StringRef Namespace = Target.getInstNamespace(); - std::string OpNameNS = "OpName"; // Map of operand names to their enumeration value. This will be used to // generate the OpName enum. 
std::map Operands; @@ -293,24 +292,19 @@ void InstrInfoEmitter::emitOperandNameMappings( OS << "#ifdef GET_INSTRINFO_OPERAND_ENUM\n"; OS << "#undef GET_INSTRINFO_OPERAND_ENUM\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; - OS << "namespace " << OpNameNS << " {\n"; + OS << "namespace llvm::" << Namespace << "::OpName {\n"; OS << "enum {\n"; for (const auto &Op : Operands) OS << " " << Op.first << " = " << Op.second << ",\n"; OS << " OPERAND_LAST"; OS << "\n};\n"; - OS << "} // end namespace OpName\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "::OpName\n"; OS << "#endif //GET_INSTRINFO_OPERAND_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_NAMED_OPS\n"; OS << "#undef GET_INSTRINFO_NAMED_OPS\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY\n"; OS << "int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx) {\n"; if (!Operands.empty()) { @@ -343,8 +337,7 @@ void InstrInfoEmitter::emitOperandNameMappings( OS << " return -1;\n"; } OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif //GET_INSTRINFO_NAMED_OPS\n\n"; } @@ -365,9 +358,7 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << "#ifdef GET_INSTRINFO_OPERAND_TYPES_ENUM\n"; OS << "#undef GET_INSTRINFO_OPERAND_TYPES_ENUM\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; - OS << "namespace OpTypes {\n"; + OS << "namespace llvm::" << Namespace << "::OpTypes {\n"; OS << "enum OperandType {\n"; unsigned EnumVal = 0; @@ -382,15 +373,12 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << " OPERAND_TYPE_LIST_END" << "\n};\n"; - OS << "} // end namespace OpTypes\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "::OpTypes\n"; OS << "#endif // GET_INSTRINFO_OPERAND_TYPES_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_OPERAND_TYPE\n"; OS << "#undef GET_INSTRINFO_OPERAND_TYPE\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY\n"; OS << "static int getOperandType(uint16_t Opcode, uint16_t OpIdx) {\n"; auto getInstrName = [&](int I) -> StringRef { @@ -465,14 +453,12 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << " llvm_unreachable(\"No instructions defined\");\n"; } OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_OPERAND_TYPE\n\n"; OS << "#ifdef GET_INSTRINFO_MEM_OPERAND_SIZE\n"; OS << "#undef GET_INSTRINFO_MEM_OPERAND_SIZE\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY\n"; OS << "static int getMemOperandSize(int OpType) {\n"; OS << " switch (OpType) {\n"; @@ -490,8 +476,7 @@ void InstrInfoEmitter::emitOperandTypeMappings( OS << " return " << KV.first << ";\n\n"; } OS << " }\n}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_MEM_OPERAND_SIZE\n\n"; } @@ -526,8 +511,7 @@ void 
InstrInfoEmitter::emitLogicalOperandSizeMappings( OS << "#ifdef GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n"; OS << "#undef GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY static unsigned\n"; OS << "getLogicalOperandSize(uint16_t Opcode, uint16_t LogicalOpIdx) {\n"; if (!InstMap.empty()) { @@ -577,8 +561,7 @@ void InstrInfoEmitter::emitLogicalOperandSizeMappings( OS << " return S;\n"; OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_LOGICAL_OPERAND_SIZE_MAP\n\n"; } @@ -619,8 +602,7 @@ void InstrInfoEmitter::emitLogicalOperandTypeMappings( OS << "#ifdef GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n"; OS << "#undef GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; OS << "LLVM_READONLY static int\n"; OS << "getLogicalOperandType(uint16_t Opcode, uint16_t LogicalOpIdx) {\n"; if (!InstMap.empty()) { @@ -666,8 +648,7 @@ void InstrInfoEmitter::emitLogicalOperandTypeMappings( OS << " return -1;\n"; } OS << "}\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_LOGICAL_OPERAND_TYPE_MAP\n\n"; } @@ -701,8 +682,7 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, OS << "#ifdef GET_INSTRINFO_MC_HELPERS\n"; OS << "#undef GET_INSTRINFO_MC_HELPERS\n\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << TargetName << "_MC {\n\n"; + OS << "namespace llvm::" << TargetName << "_MC {\n"; PredicateExpander PE(TargetName); PE.setExpandForMC(true); @@ -716,8 +696,7 @@ void InstrInfoEmitter::emitMCIIHelperMethods(raw_ostream &OS, OS << "\n}\n\n"; } - OS << "} // end namespace " << TargetName << "_MC\n"; - OS << "} // end namespace llvm\n\n"; + OS << "} // end namespace llvm::" << TargetName << "_MC\n"; OS << "#endif // GET_GENISTRINFO_MC_HELPERS\n\n"; } @@ -743,8 +722,7 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << "#endif\n"; OS << "#ifdef GET_COMPUTE_FEATURES\n" << "#undef GET_COMPUTE_FEATURES\n" - << "namespace llvm {\n" - << "namespace " << Target.getName() << "_MC {\n\n"; + << "namespace llvm::" << Target.getName() << "_MC {\n"; // Emit the subtarget feature enumeration. 
SubtargetFeatureInfo::emitSubtargetFeatureBitEnumeration(SubtargetFeatures, @@ -827,14 +805,12 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << " return FeatureBitsets[RequiredFeaturesRefs[Opcode]];\n" << "}\n\n"; - OS << "} // end namespace " << Target.getName() << "_MC\n" - << "} // end namespace llvm\n" + OS << "} // end namespace llvm::" << Target.getName() << "_MC\n" << "#endif // GET_COMPUTE_FEATURES\n\n"; OS << "#ifdef GET_AVAILABLE_OPCODE_CHECKER\n" << "#undef GET_AVAILABLE_OPCODE_CHECKER\n" - << "namespace llvm {\n" - << "namespace " << Target.getName() << "_MC {\n"; + << "namespace llvm::" << Target.getName() << "_MC {\n"; OS << "bool isOpcodeAvailable(" << "unsigned Opcode, const FeatureBitset &Features) {\n" << " FeatureBitset AvailableFeatures = " @@ -846,16 +822,14 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << " RequiredFeatures;\n" << " return !MissingFeatures.any();\n" << "}\n"; - OS << "} // end namespace " << Target.getName() << "_MC\n" - << "} // end namespace llvm\n" + OS << "} // end namespace llvm::" << Target.getName() << "_MC\n" << "#endif // GET_AVAILABLE_OPCODE_CHECKER\n\n"; OS << "#ifdef ENABLE_INSTR_PREDICATE_VERIFIER\n" << "#undef ENABLE_INSTR_PREDICATE_VERIFIER\n" << "#include \n\n"; - OS << "namespace llvm {\n"; - OS << "namespace " << Target.getName() << "_MC {\n\n"; + OS << "namespace llvm::" << Target.getName() << "_MC {\n"; // Emit the name table for error messages. OS << "#ifndef NDEBUG\n"; @@ -886,8 +860,7 @@ void InstrInfoEmitter::emitFeatureVerifier(raw_ostream &OS, << " }\n" << "#endif // NDEBUG\n"; OS << "}\n"; - OS << "} // end namespace " << Target.getName() << "_MC\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Target.getName() << "_MC\n"; OS << "#endif // ENABLE_INSTR_PREDICATE_VERIFIER\n\n"; } @@ -1318,17 +1291,14 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) { OS << "#ifdef GET_INSTRINFO_ENUM\n"; OS << "#undef GET_INSTRINFO_ENUM\n"; - OS << "namespace llvm {\n\n"; - const CodeGenTarget &Target = CDP.getTargetInfo(); - - // We must emit the PHI opcode first... 
StringRef Namespace = Target.getInstNamespace(); if (Namespace.empty()) PrintFatalError("No instructions defined!"); - OS << "namespace " << Namespace << " {\n"; + OS << "namespace llvm::" << Namespace << " {\n"; + OS << " enum {\n"; unsigned Num = 0; for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) @@ -1336,24 +1306,19 @@ void InstrInfoEmitter::emitEnums(raw_ostream &OS) { << "\t= " << (Num = Target.getInstrIntValue(Inst->TheDef)) << ",\n"; OS << " INSTRUCTION_LIST_END = " << Num + 1 << "\n"; OS << " };\n\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "\n"; OS << "#endif // GET_INSTRINFO_ENUM\n\n"; OS << "#ifdef GET_INSTRINFO_SCHED_ENUM\n"; OS << "#undef GET_INSTRINFO_SCHED_ENUM\n"; - OS << "namespace llvm {\n\n"; - OS << "namespace " << Namespace << " {\n"; - OS << "namespace Sched {\n"; + OS << "namespace llvm::" << Namespace << "::Sched {\n\n"; OS << " enum {\n"; Num = 0; for (const auto &Class : SchedModels.explicit_classes()) OS << " " << Class.Name << "\t= " << Num++ << ",\n"; OS << " SCHED_LIST_END = " << Num << "\n"; OS << " };\n"; - OS << "} // end namespace Sched\n"; - OS << "} // end namespace " << Namespace << "\n"; - OS << "} // end namespace llvm\n"; + OS << "} // end namespace llvm::" << Namespace << "::Sched\n"; OS << "#endif // GET_INSTRINFO_SCHED_ENUM\n\n"; } From cccb55491223cd410cb2f83973377dd75757cb60 Mon Sep 17 00:00:00 2001 From: Brad Smith Date: Fri, 24 Jan 2025 11:33:05 -0500 Subject: [PATCH 021/432] [lldb] Remove unused posix_openpt function definition for Android (#124257) This was for the wrapper function that was in source/Host/android/LibcGlue.cpp. Android added support 10+ years ago. --- lldb/source/Host/common/PseudoTerminal.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lldb/source/Host/common/PseudoTerminal.cpp b/lldb/source/Host/common/PseudoTerminal.cpp index d53327973eb27..53e91aff212a4 100644 --- a/lldb/source/Host/common/PseudoTerminal.cpp +++ b/lldb/source/Host/common/PseudoTerminal.cpp @@ -27,10 +27,6 @@ #include #endif -#if defined(__ANDROID__) -int posix_openpt(int flags); -#endif - using namespace lldb_private; // PseudoTerminal constructor From 3da7de34a2bcfeef73747a9796652f6bff225de3 Mon Sep 17 00:00:00 2001 From: Slava Zakharin Date: Fri, 24 Jan 2025 08:49:35 -0800 Subject: [PATCH 022/432] [flang][runtime] Disable optimization for traceback related functions. (#124172) The backtrace may at least print the backtrace name in the call stack, but this does not happen with the release builds of the runtime. Surprisingly, specifying "no-omit-frame-pointer" did not work with GCC, so I decided to fall back to -O0 for these functions. --- flang/include/flang/Common/api-attrs.h | 11 +++++++++++ flang/runtime/stop.cpp | 14 +++++++++----- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/flang/include/flang/Common/api-attrs.h b/flang/include/flang/Common/api-attrs.h index d73e60996bc81..1ee91ca8e0d9d 100644 --- a/flang/include/flang/Common/api-attrs.h +++ b/flang/include/flang/Common/api-attrs.h @@ -178,4 +178,15 @@ #define RT_DEVICE_NOINLINE_HOST_INLINE inline #endif +/* RT_OPTNONE_ATTR allows disabling optimizations per function. */ +#if __has_attribute(optimize) +/* GCC style. */ +#define RT_OPTNONE_ATTR __attribute__((optimize("O0"))) +#elif __has_attribute(optnone) +/* Clang style. 
*/ +#define RT_OPTNONE_ATTR __attribute__((optnone)) +#else +#define RT_OPTNONE_ATTR +#endif + #endif /* !FORTRAN_RUNTIME_API_ATTRS_H_ */ diff --git a/flang/runtime/stop.cpp b/flang/runtime/stop.cpp index a7be8a082e026..f8c180e0aaffa 100644 --- a/flang/runtime/stop.cpp +++ b/flang/runtime/stop.cpp @@ -157,7 +157,7 @@ void RTNAME(PauseStatementText)(const char *code, std::size_t length) { std::exit(status); } -static void PrintBacktrace() { +static RT_NOINLINE_ATTR void PrintBacktrace() { #ifdef HAVE_BACKTRACE // TODO: Need to parse DWARF information to print function line numbers constexpr int MAX_CALL_STACK{999}; @@ -165,8 +165,12 @@ static void PrintBacktrace() { int nptrs{(int)backtrace(buffer, MAX_CALL_STACK)}; if (char **symbols{backtrace_symbols(buffer, nptrs)}) { - for (int i = 0; i < nptrs; i++) { - Fortran::runtime::Terminator{}.PrintCrashArgs("#%d %s\n", i, symbols[i]); + // Skip the PrintBacktrace() frame, as it is just a utility. + // It makes sense to start printing the backtrace + // from Abort() or backtrace(). + for (int i = 1; i < nptrs; i++) { + Fortran::runtime::Terminator{}.PrintCrashArgs( + "#%d %s\n", i - 1, symbols[i]); } free(symbols); } @@ -179,14 +183,14 @@ static void PrintBacktrace() { #endif } -[[noreturn]] void RTNAME(Abort)() { +[[noreturn]] RT_OPTNONE_ATTR void RTNAME(Abort)() { #ifdef HAVE_BACKTRACE PrintBacktrace(); #endif std::abort(); } -void FORTRAN_PROCEDURE_NAME(backtrace)() { PrintBacktrace(); } +RT_OPTNONE_ATTR void FORTRAN_PROCEDURE_NAME(backtrace)() { PrintBacktrace(); } [[noreturn]] void RTNAME(ReportFatalUserError)( const char *message, const char *source, int line) { From a976036a100b7dd459b6cabac96159875fcd513d Mon Sep 17 00:00:00 2001 From: lntue Date: Fri, 24 Jan 2025 11:57:43 -0500 Subject: [PATCH 023/432] [libc][NFC] Remove extra ; in exhaustive_test.h. (#124216) These cause warnings when running check-libc. --- libc/test/src/math/exhaustive/exhaustive_test.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/test/src/math/exhaustive/exhaustive_test.h b/libc/test/src/math/exhaustive/exhaustive_test.h index 94489d2e55daa..5912f7a27dc52 100644 --- a/libc/test/src/math/exhaustive/exhaustive_test.h +++ b/libc/test/src/math/exhaustive/exhaustive_test.h @@ -225,7 +225,7 @@ struct LlvmLibcExhaustiveMathTest std::cout << "-- Testing for FE_TOWARDZERO in range [0x" << std::hex << start << ", 0x" << stop << ") --" << std::dec << std::endl; test_full_range(mpfr::RoundingMode::TowardZero, start, stop); - }; + } void test_full_range_all_roundings(StorageType x_start, StorageType x_stop, StorageType y_start, StorageType y_stop) { @@ -252,7 +252,7 @@ struct LlvmLibcExhaustiveMathTest << ", 0x" << y_stop << ") --" << std::dec << std::endl; test_full_range(mpfr::RoundingMode::TowardZero, x_start, x_stop, y_start, y_stop); - }; + } }; template Func> From ba6774f997ee28157b0a3b8816cc76b94ed1da17 Mon Sep 17 00:00:00 2001 From: Adam Siemieniuk Date: Fri, 24 Jan 2025 18:09:48 +0100 Subject: [PATCH 024/432] [mlir][xegpu] Fix verifier diagnostic recursion (#124148) Uses global diagnostic message in operation verifier to avoid infinite recursion on a warning. Emitting diagnostics through the operation under verification creates a loop where verifier runs again before printing the message. 
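To make the loop concrete, here is a minimal sketch of the two emission
styles (names are taken from the diff below; the recursion assumes the
default diagnostic setup, which appends the offending op to the message
and re-runs the verifier when printing an op in custom form):

    // Problematic: the diagnostic is attached to the op. Rendering the
    // message prints the op, printing re-verifies it, and verify()
    // reaches this statement again before anything is flushed.
    emitWarning("Invalid transpose attr. It is ignored.");

    // Safe: attach the diagnostic to the location only, so no op is
    // printed and no further verification is triggered.
    mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored.";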
--- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 15c435f1fa257..81f46f941785a 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -294,7 +294,7 @@ LogicalResult LoadNdOp::verify() { if (valid) transpose(trans, tdescShape); else - emitWarning("Invalid transpose attr. It is ignored."); + mlir::emitWarning(getLoc()) << "Invalid transpose attr. It is ignored."; } if (getPacked()) { @@ -304,8 +304,9 @@ LogicalResult LoadNdOp::verify() { tdescShape[axis] /= vnni_factor; tdescShape.push_back(vnni_factor); } else { - emitWarning("Invalid Packed Attr. It is ignored (available for 2D " - "TensorDesc only)."); + mlir::emitWarning(getLoc()) + << "Invalid Packed Attr. It is ignored (available for 2D " + "TensorDesc only)."; } } From d88293d8a2005b19f89a86252c60102cec6c9b01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 24 Jan 2025 17:15:06 +0000 Subject: [PATCH 025/432] [mlir][vector] Disable `BreakDownVectorBitCast` for scalable vectors (#122725) `BreakDownVectorBitCast` leverages * `vector.extract_strided_slices` + `vector.insert_strided_slices` As these Ops do not support extracting scalable sub-vectors (i.e. extracting/inserting a fraction of a scalable dim), it's best to bail out. --- .../Dialect/Vector/Transforms/VectorTransforms.cpp | 7 +++++++ .../Dialect/Vector/vector-break-down-bitcast.mlir | 11 +++++++++++ 2 files changed, 18 insertions(+) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index d9be8d0e578ae..275f11160487a 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -906,6 +906,13 @@ struct BreakDownVectorBitCast : public OpRewritePattern { VectorType castDstType = bitcastOp.getResultVectorType(); assert(castSrcType.getRank() == castDstType.getRank()); + // This transformation builds on top of + // vector.{extract|insert}_strided_slice, which do not support + // extracting/inserting "scallable sub-vectors". Bail out. + if (castSrcType.isScalable()) + return rewriter.notifyMatchFailure(bitcastOp, + "Scalable vectors are not supported"); + // Only support rank 1 case for now. if (castSrcType.getRank() != 1) return failure(); diff --git a/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir b/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir index fbb2f7605e649..173388f63ecda 100644 --- a/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir +++ b/mlir/test/Dialect/Vector/vector-break-down-bitcast.mlir @@ -39,3 +39,14 @@ func.func @bitcast_i8_to_i32(%input: vector<16xi8>) -> vector<4xi32> { // CHECK: %[[CAST3:.+]] = vector.bitcast %[[EXTRACT3]] : vector<4xi8> to vector<1xi32> // CHECK: %[[INSERT3:.+]] = vector.insert_strided_slice %[[CAST3]], %[[INSERT2]] {offsets = [3], strides = [1]} : vector<1xi32> into vector<4xi32> // CHECK: return %[[INSERT3]] + +// ----- + +// Scalable vectors are not supported! 
+ +// CHECK-LABEL: func.func @bitcast_scalable_negative +// CHECK: vector.bitcast +func.func @bitcast_scalable_negative(%input: vector<[8]xf16>) -> vector<[4]xf32> { + %0 = vector.bitcast %input : vector<[8]xf16> to vector<[4]xf32> + return %0: vector<[4]xf32> +} From 474f5d2aefb44430b89ed72774a3c1d26a0adfb1 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 24 Jan 2025 17:22:27 +0000 Subject: [PATCH 026/432] [FMV][AArch64] Remove features predres and ls64. (#124266) These cannot be detected by reading the ID_AA64ISAR1_EL1 register since their corresponding bitfields are hidden. Additionally the instructions that these features enable are unusable from EL0. ACLE: https://github.com/ARM-software/acle/pull/382 --- .../CodeGen/AArch64/cpu-supports-target.c | 6 +-- clang/test/CodeGen/AArch64/cpu-supports.c | 6 +-- clang/test/CodeGen/AArch64/fmv-dependencies.c | 8 ---- clang/test/CodeGen/AArch64/fmv-features.c | 8 ---- clang/test/CodeGen/AArch64/fmv-priority.c | 10 ++--- .../test/CodeGen/attr-target-clones-aarch64.c | 18 ++++----- clang/test/CodeGen/attr-target-version.c | 38 +++++++++---------- .../CodeGenCXX/attr-target-clones-aarch64.cpp | 20 +++++----- clang/test/Sema/attr-target-version.c | 2 +- .../builtins/cpu_model/AArch64CPUFeatures.inc | 10 ++--- .../builtins/cpu_model/aarch64/fmv/apple.inc | 2 - .../builtins/cpu_model/aarch64/fmv/mrs.inc | 11 ------ .../llvm/TargetParser/AArch64CPUFeatures.inc | 10 ++--- llvm/lib/Target/AArch64/AArch64FMV.td | 2 - 14 files changed, 60 insertions(+), 91 deletions(-) diff --git a/clang/test/CodeGen/AArch64/cpu-supports-target.c b/clang/test/CodeGen/AArch64/cpu-supports-target.c index 6223db7c09253..a39ffd4e4a74d 100644 --- a/clang/test/CodeGen/AArch64/cpu-supports-target.c +++ b/clang/test/CodeGen/AArch64/cpu-supports-target.c @@ -91,8 +91,8 @@ // CHECK-NEXT: br label %[[RETURN]] // CHECK: [[IF_ELSE16]]: // CHECK-NEXT: [[TMP36:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 10836786603360256 -// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 10836786603360256 +// CHECK-NEXT: [[TMP37:%.*]] = and i64 [[TMP36]], 1688849860263936 +// CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[TMP37]], 1688849860263936 // CHECK-NEXT: [[TMP39:%.*]] = and i1 true, [[TMP38]] // CHECK-NEXT: br i1 [[TMP39]], label %[[IF_THEN17:.*]], label %[[IF_ELSE18:.*]] // CHECK: [[IF_THEN17]]: @@ -142,7 +142,7 @@ int check_all_features() { return 8; else if (__builtin_cpu_supports("sme+memtag+sb")) return 9; - else if (__builtin_cpu_supports("predres+ssbs+bti+ls64")) + else if (__builtin_cpu_supports("ssbs+bti")) return 10; else if (__builtin_cpu_supports("wfxt+sme-f64f64")) return 11; diff --git a/clang/test/CodeGen/AArch64/cpu-supports.c b/clang/test/CodeGen/AArch64/cpu-supports.c index 406201781d480..5691901bcd98f 100644 --- a/clang/test/CodeGen/AArch64/cpu-supports.c +++ b/clang/test/CodeGen/AArch64/cpu-supports.c @@ -27,8 +27,8 @@ // CHECK-NEXT: br label [[RETURN]] // CHECK: if.end2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 171141184020873984 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 171141184020873984 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 162133984766132992 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 162133984766132992 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[IF_THEN3:%.*]], label [[IF_END4:%.*]] // CHECK: if.then3: @@ -53,7 +53,7 @@ int 
main(void) { if (__builtin_cpu_supports("sve2-aes+memtag")) return 2; - if (__builtin_cpu_supports("sme2+ls64+wfxt")) + if (__builtin_cpu_supports("sme2+wfxt")) return 3; if (__builtin_cpu_supports("avx2")) diff --git a/clang/test/CodeGen/AArch64/fmv-dependencies.c b/clang/test/CodeGen/AArch64/fmv-dependencies.c index 8dda3b647fcd0..7cfab7de41a9d 100644 --- a/clang/test/CodeGen/AArch64/fmv-dependencies.c +++ b/clang/test/CodeGen/AArch64/fmv-dependencies.c @@ -60,9 +60,6 @@ __attribute__((target_version("i8mm"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mjscvt() #[[jscvt:[0-9]+]] { __attribute__((target_version("jscvt"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mls64() #[[ls64:[0-9]+]] { -__attribute__((target_version("ls64"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] { __attribute__((target_version("lse"))) int fmv(void) { return 0; } @@ -72,9 +69,6 @@ __attribute__((target_version("memtag"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mmops() #[[mops:[0-9]+]] { __attribute__((target_version("mops"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mpredres() #[[predres:[0-9]+]] { -__attribute__((target_version("predres"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mrcpc() #[[rcpc:[0-9]+]] { __attribute__((target_version("rcpc"))) int fmv(void) { return 0; } @@ -169,11 +163,9 @@ int caller() { // CHECK: attributes #[[frintts]] = { {{.*}} "target-features"="+fp-armv8,+fptoint,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[i8mm]] = { {{.*}} "target-features"="+fp-armv8,+i8mm,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[jscvt]] = { {{.*}} "target-features"="+fp-armv8,+jsconv,+neon,+outline-atomics,+v8a" -// CHECK: attributes #[[ls64]] = { {{.*}} "target-features"="+fp-armv8,+ls64,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[lse]] = { {{.*}} "target-features"="+fp-armv8,+lse,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[memtag]] = { {{.*}} "target-features"="+fp-armv8,+mte,+neon,+outline-atomics,+v8a" // CHECK: attributes #[[mops]] = { {{.*}} "target-features"="+fp-armv8,+mops,+neon,+outline-atomics,+v8a" -// CHECK: attributes #[[predres]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+predres,+v8a" // CHECK: attributes #[[rcpc]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+v8a" // CHECK: attributes #[[rcpc2]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+v8a" // CHECK: attributes #[[rcpc3]] = { {{.*}} "target-features"="+fp-armv8,+neon,+outline-atomics,+rcpc,+rcpc-immo,+rcpc3,+v8a" diff --git a/clang/test/CodeGen/AArch64/fmv-features.c b/clang/test/CodeGen/AArch64/fmv-features.c index d191f8187eb6b..fdc64e2cd395c 100644 --- a/clang/test/CodeGen/AArch64/fmv-features.c +++ b/clang/test/CodeGen/AArch64/fmv-features.c @@ -58,9 +58,6 @@ __attribute__((target_version("i8mm"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mjscvt() #[[jscvt:[0-9]+]] { __attribute__((target_version("jscvt"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mls64() #[[ls64:[0-9]+]] { -__attribute__((target_version("ls64"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mlse() #[[lse:[0-9]+]] { __attribute__((target_version("lse"))) int fmv(void) { return 0; } @@ -70,9 +67,6 @@ __attribute__((target_version("memtag"))) int fmv(void) { return 0; } // CHECK: define dso_local i32 @fmv._Mmops() 
#[[mops:[0-9]+]] { __attribute__((target_version("mops"))) int fmv(void) { return 0; } -// CHECK: define dso_local i32 @fmv._Mpredres() #[[predres:[0-9]+]] { -__attribute__((target_version("predres"))) int fmv(void) { return 0; } - // CHECK: define dso_local i32 @fmv._Mrcpc() #[[rcpc:[0-9]+]] { __attribute__((target_version("rcpc"))) int fmv(void) { return 0; } @@ -171,11 +165,9 @@ int caller() { // CHECK: attributes #[[frintts]] = {{.*}} "fmv-features"="frintts" // CHECK: attributes #[[i8mm]] = {{.*}} "fmv-features"="i8mm" // CHECK: attributes #[[jscvt]] = {{.*}} "fmv-features"="jscvt" -// CHECK: attributes #[[ls64]] = {{.*}} "fmv-features"="ls64" // CHECK: attributes #[[lse]] = {{.*}} "fmv-features"="lse" // CHECK: attributes #[[memtag]] = {{.*}} "fmv-features"="memtag" // CHECK: attributes #[[mops]] = {{.*}} "fmv-features"="mops" -// CHECK: attributes #[[predres]] = {{.*}} "fmv-features"="predres" // CHECK: attributes #[[rcpc]] = {{.*}} "fmv-features"="rcpc" // CHECK: attributes #[[rcpc2]] = {{.*}} "fmv-features"="rcpc2" // CHECK: attributes #[[rcpc3]] = {{.*}} "fmv-features"="rcpc3" diff --git a/clang/test/CodeGen/AArch64/fmv-priority.c b/clang/test/CodeGen/AArch64/fmv-priority.c index ff82aef89a33d..c92e0c4e9c3db 100644 --- a/clang/test/CodeGen/AArch64/fmv-priority.c +++ b/clang/test/CodeGen/AArch64/fmv-priority.c @@ -5,7 +5,7 @@ // // MSB LSB // -// sme2 | ls64 | sme | bf16 | | | fp16 | simd | fp +// sme2 | wfxt | sme | bf16 | | | fp16 | simd | fp // -----+------+-----+------+-------+------+------+------+--- // sme2 | | sme | bf16 | rcpc2 | rcpc | fp16 | simd | fp // @@ -13,7 +13,7 @@ // feature can only depend on lower priority features: // https://github.com/ARM-software/acle/pull/376 -__attribute__((target_version("sme2+ls64"))) int fn(void); +__attribute__((target_version("sme2+wfxt"))) int fn(void); __attribute__((target_version("sme2+rcpc2"))) int fn(void); __attribute__((target_version("default"))) int fn(void) { return 0; } @@ -36,12 +36,12 @@ int call() { return fn(); } // CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 153126785511392000 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 153126785511392000 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 162133984766132992 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 162133984766132992 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] // CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @fn._Mls64Msme2 +// CHECK-NEXT: ret ptr @fn._Msme2Mwfxt // CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 144119586269233920 diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index 9e1588cd48336..ac926f2329cc4 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -12,7 +12,7 @@ int foo() { return ftc() + ftc_def() + ftc_dup1() + ftc_dup2() + ftc_dup3(); } -inline int __attribute__((target_clones("rng+simd", "rcpc+predres", "sve2-aes+wfxt"))) ftc_inline1(void) { return 1; } +inline int __attribute__((target_clones("rng+simd", "rcpc", "sve2-aes+wfxt"))) ftc_inline1(void) { return 1; } inline int __attribute__((target_clones("fp16", 
"fcma+sve2-bitperm", "default"))) ftc_inline2(void); inline int __attribute__((target_clones("bti", "sve+sb"))) ftc_inline3(void) { return 3; } @@ -336,7 +336,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc +// CHECK-LABEL: define {{[^@]+}}@ftc_inline1._Mrcpc // CHECK-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 @@ -368,12 +368,12 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4194304 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 4194304 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @ftc_inline1._MpredresMrcpc +// CHECK-NEXT: ret ptr @ftc_inline1._Mrcpc // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 769 @@ -793,7 +793,7 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // // // CHECK-MTE-BTI: Function Attrs: noinline nounwind optnone -// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._MpredresMrcpc +// CHECK-MTE-BTI-LABEL: define {{[^@]+}}@ftc_inline1._Mrcpc // CHECK-MTE-BTI-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-MTE-BTI-NEXT: entry: // CHECK-MTE-BTI-NEXT: ret i32 1 @@ -825,12 +825,12 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._Msve2-aesMwfxt // CHECK-MTE-BTI: resolver_else: // CHECK-MTE-BTI-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 140737492549632 -// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 140737492549632 +// CHECK-MTE-BTI-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 4194304 +// CHECK-MTE-BTI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 4194304 // CHECK-MTE-BTI-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-MTE-BTI-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK-MTE-BTI: resolver_return1: -// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._MpredresMrcpc +// CHECK-MTE-BTI-NEXT: ret ptr @ftc_inline1._Mrcpc // CHECK-MTE-BTI: resolver_else2: // CHECK-MTE-BTI-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-MTE-BTI-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 769 diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index a75514d63bce3..11655b2efcd84 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -5,14 +5,14 @@ int __attribute__((target_version("rng+flagm+fp16fml"))) fmv(void) { return 1; } int __attribute__((target_version("flagm2+sme-i16i64"))) fmv(void) { return 2; } int __attribute__((target_version("lse+sha2"))) fmv(void) { return 3; } -int __attribute__((target_version("dotprod+ls64"))) fmv(void) { return 4; } +int __attribute__((target_version("dotprod+wfxt"))) fmv(void) { return 4; } int 
__attribute__((target_version("fp16fml+memtag"))) fmv(void) { return 5; } int __attribute__((target_version("fp+aes"))) fmv(void) { return 6; } -int __attribute__((target_version("crc+ls64"))) fmv(void) { return 7; } +int __attribute__((target_version("crc+wfxt"))) fmv(void) { return 7; } int __attribute__((target_version("bti"))) fmv(void) { return 8; } int __attribute__((target_version("sme2"))) fmv(void) { return 9; } int __attribute__((target_version("default"))) fmv(void) { return 0; } -int __attribute__((target_version("ls64+simd"))) fmv_one(void) { return 1; } +int __attribute__((target_version("wfxt+simd"))) fmv_one(void) { return 1; } int __attribute__((target_version("dpb"))) fmv_one(void) { return 2; } int __attribute__((target_version("default"))) fmv_one(void) { return 0; } int __attribute__((target_version("fp"))) fmv_two(void) { return 1; } @@ -41,7 +41,7 @@ inline int __attribute__((target_version("fp+sm4"))) fmv_inline(void) { return 1 inline int __attribute__((target_version("lse+rdm"))) fmv_inline(void) { return 16; } inline int __attribute__((target_version("default"))) fmv_inline(void) { return 3; } -__attribute__((target_version("ls64"))) int fmv_e(void); +__attribute__((target_version("wfxt"))) int fmv_e(void); int fmv_e(void) { return 20; } static __attribute__((target_version("sb"))) inline int fmv_d(void); @@ -173,7 +173,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMls64 +// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMwfxt // CHECK-SAME: () #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 @@ -194,7 +194,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMls64 +// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMwfxt // CHECK-SAME: () #[[ATTR6:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 @@ -222,7 +222,7 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_one._Mls64Msimd +// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMwfxt // CHECK-SAME: () #[[ATTR10:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 @@ -479,20 +479,20 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: ret ptr @fmv._Mflagm2Msme-i16i64 // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 9007199254742016 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 9007199254742016 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 18014398509483008 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 18014398509483008 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] // CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @fmv._McrcMls64 +// CHECK-NEXT: ret ptr @fmv._McrcMwfxt // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 9007199254741776 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 9007199254741776 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 18014398509482768 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 
18014398509482768 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv._MdotprodMls64 +// CHECK-NEXT: ret ptr @fmv._MdotprodMwfxt // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 1125899906842624 @@ -541,12 +541,12 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254741760 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254741760 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014398509482752 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014398509482752 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_one._Mls64Msimd +// CHECK-NEXT: ret ptr @fmv_one._MsimdMwfxt // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 262144 @@ -593,12 +593,12 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254740992 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254740992 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 18014398509481984 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 18014398509481984 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] // CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_e._Mls64 +// CHECK-NEXT: ret ptr @fmv_e._Mwfxt // CHECK: resolver_else: // CHECK-NEXT: ret ptr @fmv_e.default // diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp index a2cc9f30f026a..4f553262c73b5 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --include-generated-funcs --global-value-regex ".*" --version 5 // RUN: %clang_cc1 -std=c++11 -triple aarch64-linux-gnu -emit-llvm %s -o - | FileCheck %s -int __attribute__((target_clones("ls64+fp16", "default"))) foo_ovl(int) { return 1; } -int __attribute__((target_clones("fp16+ls64"))) foo_ovl(void) { return 2; } +int __attribute__((target_clones("fp16", "default"))) foo_ovl(int) { return 1; } +int __attribute__((target_clones("fp16"))) foo_ovl(void) { return 2; } int bar() { return foo_ovl(1) + foo_ovl(); @@ -45,7 +45,7 @@ void run_foo_tml() { // CHECK: @_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver // CHECK: @_ZN7MyClassIisE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver //. 
-// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli._Mfp16Mls64( +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovli._Mfp16( // CHECK-SAME: i32 noundef [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 @@ -57,17 +57,17 @@ void run_foo_tml() { // CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254806784 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254806784 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65792 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65792 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] // CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16Mls64 +// CHECK-NEXT: ret ptr @_Z7foo_ovli._Mfp16 // CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z7foo_ovli.default // // -// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv._Mfp16Mls64( +// CHECK-LABEL: define dso_local noundef i32 @_Z7foo_ovlv._Mfp16( // CHECK-SAME: ) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: ret i32 2 @@ -77,12 +77,12 @@ void run_foo_tml() { // CHECK-NEXT: [[RESOLVER_ENTRY:.*:]] // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 9007199254806784 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 9007199254806784 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 65792 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 65792 // CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] // CHECK-NEXT: br i1 [[TMP3]], label %[[RESOLVER_RETURN:.*]], label %[[RESOLVER_ELSE:.*]] // CHECK: [[RESOLVER_RETURN]]: -// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mfp16Mls64 +// CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mfp16 // CHECK: [[RESOLVER_ELSE]]: // CHECK-NEXT: ret ptr @_Z7foo_ovlv.default // diff --git a/clang/test/Sema/attr-target-version.c b/clang/test/Sema/attr-target-version.c index 096d2f003a004..cfcc1622abe5c 100644 --- a/clang/test/Sema/attr-target-version.c +++ b/clang/test/Sema/attr-target-version.c @@ -78,7 +78,7 @@ void __attribute__((target_version("rdm+rng+crc"))) redef(void) {} int def(void); void __attribute__((target_version("dit"))) nodef(void); -void __attribute__((target_version("ls64"))) nodef(void); +void __attribute__((target_version("wfxt"))) nodef(void); void __attribute__((target_version("aes"))) ovl(void); void __attribute__((target_version("default"))) ovl(void); int bar() { diff --git a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc index 6b373ce424678..778f568c95c5e 100644 --- a/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc +++ b/compiler-rt/lib/builtins/cpu_model/AArch64CPUFeatures.inc @@ -69,13 +69,13 @@ enum CPUFeatures { FEAT_MEMTAG2, RESERVED_FEAT_MEMTAG3, // previously used and now ABI legacy FEAT_SB, - FEAT_PREDRES, - RESERVED_FEAT_SSBS, // previously used and now ABI legacy + RESERVED_FEAT_PREDRES, // previously used and now ABI legacy + RESERVED_FEAT_SSBS, // previously used and now ABI legacy FEAT_SSBS2, FEAT_BTI, - RESERVED_FEAT_LS64, // previously used and now ABI legacy - RESERVED_FEAT_LS64_V, // previously used and now ABI legacy - FEAT_LS64_ACCDATA, + RESERVED_FEAT_LS64, // 
previously used and now ABI legacy + RESERVED_FEAT_LS64_V, // previously used and now ABI legacy + RESERVED_FEAT_LS64_ACCDATA, // previously used and now ABI legacy FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc index 56ad3f8967b9a..d5c85701ad1a0 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -74,7 +74,6 @@ void __init_cpu_features_resolver(void) { CHECK_BIT(CAP_BIT_FEAT_LSE, FEAT_LSE); CHECK_BIT(CAP_BIT_FEAT_SHA256, FEAT_SHA2); CHECK_BIT(CAP_BIT_FEAT_PMULL, FEAT_PMULL); - CHECK_BIT(CAP_BIT_FEAT_SPECRES, FEAT_PREDRES); CHECK_BIT(CAP_BIT_FEAT_SB, FEAT_SB); CHECK_BIT(CAP_BIT_FEAT_FRINTTS, FEAT_FRINTTS); CHECK_BIT(CAP_BIT_FEAT_LRCPC, FEAT_RCPC); @@ -132,7 +131,6 @@ void __init_cpu_features_resolver(void) { {"hw.optional.arm.FEAT_I8MM", FEAT_I8MM}, {"hw.optional.arm.FEAT_BF16", FEAT_BF16}, {"hw.optional.arm.FEAT_SB", FEAT_SB}, - {"hw.optional.arm.FEAT_SPECRES", FEAT_PREDRES}, {"hw.optional.arm.FEAT_SSBS", FEAT_SSBS2}, {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, }; diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc index a3dbeb065403d..6d46fccdc79d9 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -81,17 +81,6 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_SME_F64); if (hwcap2 & HWCAP2_MOPS) setCPUFeature(FEAT_MOPS); - if (hwcap & HWCAP_CPUID) { - unsigned long ftr; - - getCPUFeature(ID_AA64ISAR1_EL1, ftr); - /* ID_AA64ISAR1_EL1.SPECRES >= 0b0001 */ - if (extractBits(ftr, 40, 4) >= 0x1) - setCPUFeature(FEAT_PREDRES); - /* ID_AA64ISAR1_EL1.LS64 >= 0b0011 */ - if (extractBits(ftr, 60, 4) >= 0x3) - setCPUFeature(FEAT_LS64_ACCDATA); - } if (hwcap & HWCAP_FP) { setCPUFeature(FEAT_FP); // FP and AdvSIMD fields have the same value diff --git a/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc index 6b373ce424678..778f568c95c5e 100644 --- a/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc +++ b/llvm/include/llvm/TargetParser/AArch64CPUFeatures.inc @@ -69,13 +69,13 @@ enum CPUFeatures { FEAT_MEMTAG2, RESERVED_FEAT_MEMTAG3, // previously used and now ABI legacy FEAT_SB, - FEAT_PREDRES, - RESERVED_FEAT_SSBS, // previously used and now ABI legacy + RESERVED_FEAT_PREDRES, // previously used and now ABI legacy + RESERVED_FEAT_SSBS, // previously used and now ABI legacy FEAT_SSBS2, FEAT_BTI, - RESERVED_FEAT_LS64, // previously used and now ABI legacy - RESERVED_FEAT_LS64_V, // previously used and now ABI legacy - FEAT_LS64_ACCDATA, + RESERVED_FEAT_LS64, // previously used and now ABI legacy + RESERVED_FEAT_LS64_V, // previously used and now ABI legacy + RESERVED_FEAT_LS64_ACCDATA, // previously used and now ABI legacy FEAT_WFXT, FEAT_SME_F64, FEAT_SME_I64, diff --git a/llvm/lib/Target/AArch64/AArch64FMV.td b/llvm/lib/Target/AArch64/AArch64FMV.td index e0f56fd555619..a9503b1e6248b 100644 --- a/llvm/lib/Target/AArch64/AArch64FMV.td +++ b/llvm/lib/Target/AArch64/AArch64FMV.td @@ -57,11 +57,9 @@ def : FMVExtension<"fp16fml", "FP16FML">; let BackendFeature = "fptoint" in def : FMVExtension<"frintts", "FRINTTS">; def : FMVExtension<"i8mm", "I8MM">; def : FMVExtension<"jscvt", "JSCVT">; -def : FMVExtension<"ls64", "LS64_ACCDATA">; def : FMVExtension<"lse", 
"LSE">; def : FMVExtension<"memtag", "MEMTAG2">; def : FMVExtension<"mops", "MOPS">; -def : FMVExtension<"predres", "PREDRES">; def : FMVExtension<"rcpc", "RCPC">; let BackendFeature = "rcpc-immo" in def : FMVExtension<"rcpc2", "RCPC2">; def : FMVExtension<"rcpc3", "RCPC3">; From 5daecd4a3b9c6cca10ab6d44f539adf7310ace23 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 24 Jan 2025 09:22:47 -0800 Subject: [PATCH 027/432] [Support] Fix namespace after #123990 https://llvm.org/docs/CodingStandards.html#use-namespace-qualifiers-to-implement-previously-declared-functions --- llvm/lib/Support/AArch64BuildAttributes.cpp | 34 +++++++++++---------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Support/AArch64BuildAttributes.cpp b/llvm/lib/Support/AArch64BuildAttributes.cpp index ada34eb3f927d..4a6b2fd538803 100644 --- a/llvm/lib/Support/AArch64BuildAttributes.cpp +++ b/llvm/lib/Support/AArch64BuildAttributes.cpp @@ -9,10 +9,10 @@ #include "llvm/Support/AArch64BuildAttributes.h" #include "llvm/ADT/StringSwitch.h" -namespace llvm { -namespace AArch64BuildAttributes { +using namespace llvm; +using namespace llvm::AArch64BuildAttributes; -StringRef getVendorName(unsigned Vendor) { +StringRef AArch64BuildAttributes::getVendorName(unsigned Vendor) { switch (Vendor) { case AEABI_FEATURE_AND_BITS: return "aeabi_feature_and_bits"; @@ -25,14 +25,14 @@ StringRef getVendorName(unsigned Vendor) { return ""; } } -VendorID getVendorID(StringRef Vendor) { +VendorID AArch64BuildAttributes::getVendorID(StringRef Vendor) { return StringSwitch(Vendor) .Case("aeabi_feature_and_bits", AEABI_FEATURE_AND_BITS) .Case("aeabi_pauthabi", AEABI_PAUTHABI) .Default(VENDOR_UNKNOWN); } -StringRef getOptionalStr(unsigned Optional) { +StringRef AArch64BuildAttributes::getOptionalStr(unsigned Optional) { switch (Optional) { case REQUIRED: return "required"; @@ -43,18 +43,18 @@ StringRef getOptionalStr(unsigned Optional) { return ""; } } -SubsectionOptional getOptionalID(StringRef Optional) { +SubsectionOptional AArch64BuildAttributes::getOptionalID(StringRef Optional) { return StringSwitch(Optional) .Case("required", REQUIRED) .Case("optional", OPTIONAL) .Default(OPTIONAL_NOT_FOUND); } -StringRef getSubsectionOptionalUnknownError() { +StringRef AArch64BuildAttributes::getSubsectionOptionalUnknownError() { return "unknown AArch64 build attributes optionality, expected " "required|optional"; } -StringRef getTypeStr(unsigned Type) { +StringRef AArch64BuildAttributes::getTypeStr(unsigned Type) { switch (Type) { case ULEB128: return "uleb128"; @@ -65,17 +65,17 @@ StringRef getTypeStr(unsigned Type) { return ""; } } -SubsectionType getTypeID(StringRef Type) { +SubsectionType AArch64BuildAttributes::getTypeID(StringRef Type) { return StringSwitch(Type) .Cases("uleb128", "ULEB128", ULEB128) .Cases("ntbs", "NTBS", NTBS) .Default(TYPE_NOT_FOUND); } -StringRef getSubsectionTypeUnknownError() { +StringRef AArch64BuildAttributes::getSubsectionTypeUnknownError() { return "unknown AArch64 build attributes type, expected uleb128|ntbs"; } -StringRef getPauthABITagsStr(unsigned PauthABITag) { +StringRef AArch64BuildAttributes::getPauthABITagsStr(unsigned PauthABITag) { switch (PauthABITag) { case TAG_PAUTH_PLATFORM: return "Tag_PAuth_Platform"; @@ -86,14 +86,16 @@ StringRef getPauthABITagsStr(unsigned PauthABITag) { return ""; } } -PauthABITags getPauthABITagsID(StringRef PauthABITag) { + +PauthABITags AArch64BuildAttributes::getPauthABITagsID(StringRef PauthABITag) { return StringSwitch(PauthABITag) 
.Case("Tag_PAuth_Platform", TAG_PAUTH_PLATFORM) .Case("Tag_PAuth_Schema", TAG_PAUTH_SCHEMA) .Default(PAUTHABI_TAG_NOT_FOUND); } -StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { +StringRef +AArch64BuildAttributes::getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { switch (FeatureAndBitsTag) { case TAG_FEATURE_BTI: return "Tag_Feature_BTI"; @@ -106,12 +108,12 @@ StringRef getFeatureAndBitsTagsStr(unsigned FeatureAndBitsTag) { return ""; } } -FeatureAndBitsTags getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag) { + +FeatureAndBitsTags +AArch64BuildAttributes::getFeatureAndBitsTagsID(StringRef FeatureAndBitsTag) { return StringSwitch(FeatureAndBitsTag) .Case("Tag_Feature_BTI", TAG_FEATURE_BTI) .Case("Tag_Feature_PAC", TAG_FEATURE_PAC) .Case("Tag_Feature_GCS", TAG_FEATURE_GCS) .Default(FEATURE_AND_BITS_TAG_NOT_FOUND); } -} // namespace AArch64BuildAttributes -} // namespace llvm From c025b96ef9bb364c79f73fc3afb45c851c2efb17 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 24 Jan 2025 09:34:48 -0800 Subject: [PATCH 028/432] [ELF] Symbol::extract : remove unneeded file->lazy check --- lld/ELF/Symbols.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index ce1e89f2d0801..497db842bc9be 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -254,10 +254,9 @@ void Symbol::parseSymbolVersion(Ctx &ctx) { } void Symbol::extract(Ctx &ctx) const { - if (file->lazy) { - file->lazy = false; - parseFile(ctx, file); - } + assert(file->lazy); + file->lazy = false; + parseFile(ctx, file); } uint8_t Symbol::computeBinding(Ctx &ctx) const { From 134401deea5e86d646bb99fab39c182cfa8e5292 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 24 Jan 2025 11:36:45 -0600 Subject: [PATCH 029/432] [Offload] Move RPC server handling to a dedicated thread (#112988) Summary: Handling the RPC server requires running through list of jobs that the device has requested to be done. Currently this is handled by the thread that does the waiting for the kernel to finish. However, this is not sound on NVIDIA architectures and only works for async launches in the OpenMP model that uses helper threads. However, we also don't want to have this thread doing work unnnecessarily. For this reason we track the execution of kernels and cause the thread to sleep via a condition variable (usually backed by some kind of futex or other intelligent sleeping mechanism) so that the thread will be idle while no kernels are running. --- offload/plugins-nextgen/amdgpu/src/rtl.cpp | 59 +++++---- offload/plugins-nextgen/common/include/RPC.h | 80 +++++++++++- .../common/src/PluginInterface.cpp | 8 +- offload/plugins-nextgen/common/src/RPC.cpp | 123 +++++++++++++----- .../cuda/dynamic_cuda/cuda.cpp | 1 + .../plugins-nextgen/cuda/dynamic_cuda/cuda.h | 3 + offload/plugins-nextgen/cuda/src/rtl.cpp | 39 +++--- offload/test/libc/server.c | 56 ++++++++ 8 files changed, 281 insertions(+), 88 deletions(-) create mode 100644 offload/test/libc/server.c diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 7114dad020e3a..6fc75ac154289 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -621,9 +621,9 @@ struct AMDGPUSignalTy { } /// Wait until the signal gets a zero value. 
-  Error wait(const uint64_t ActiveTimeout = 0, RPCServerTy *RPCServer = nullptr,
+  Error wait(const uint64_t ActiveTimeout = 0,
              GenericDeviceTy *Device = nullptr) const {
-    if (ActiveTimeout && !RPCServer) {
+    if (ActiveTimeout) {
       hsa_signal_value_t Got = 1;
       Got = hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
                                       ActiveTimeout, HSA_WAIT_STATE_ACTIVE);
@@ -632,14 +632,11 @@ struct AMDGPUSignalTy {
     }
 
     // If there is an RPC device attached to this stream we run it as a server.
-    uint64_t Timeout = RPCServer ? 8192 : UINT64_MAX;
-    auto WaitState = RPCServer ? HSA_WAIT_STATE_ACTIVE : HSA_WAIT_STATE_BLOCKED;
+    uint64_t Timeout = UINT64_MAX;
+    auto WaitState = HSA_WAIT_STATE_BLOCKED;
     while (hsa_signal_wait_scacquire(HSASignal, HSA_SIGNAL_CONDITION_EQ, 0,
-                                     Timeout, WaitState) != 0) {
-      if (RPCServer && Device)
-        if (auto Err = RPCServer->runServer(*Device))
-          return Err;
-    }
+                                     Timeout, WaitState) != 0)
+      ;
     return Plugin::success();
   }
 
@@ -1052,11 +1049,6 @@ struct AMDGPUStreamTy {
   /// operation that was already finalized in a previous stream synchronize.
   uint32_t SyncCycle;
 
-  /// A pointer associated with an RPC server running on the given device. If
-  /// RPC is not being used this will be a null pointer. Otherwise, this
-  /// indicates that an RPC server is expected to be run on this stream.
-  RPCServerTy *RPCServer;
-
   /// Mutex to protect stream's management.
   mutable std::mutex Mutex;
 
@@ -1236,9 +1228,6 @@ struct AMDGPUStreamTy {
   /// Deinitialize the stream's signals.
   Error deinit() { return Plugin::success(); }
 
-  /// Attach an RPC server to this stream.
-  void setRPCServer(RPCServerTy *Server) { RPCServer = Server; }
-
   /// Push an asynchronous kernel to the stream. The kernel arguments must be
   /// placed in a special allocation for kernel args and be kept alive until
   /// the kernel finalizes. Once the kernel is finished, the stream will release
@@ -1266,10 +1255,30 @@ struct AMDGPUStreamTy {
     if (auto Err = Slots[Curr].schedReleaseBuffer(KernelArgs, MemoryManager))
       return Err;
 
+    // If we are running an RPC server we want to wake up the server thread
+    // whenever there is a kernel running and let it sleep otherwise.
+    if (Device.getRPCServer())
+      Device.Plugin.getRPCServer().Thread->notify();
+
     // Push the kernel with the output signal and an input signal (optional)
-    return Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads, NumBlocks,
-                                   GroupSize, StackSize, OutputSignal,
-                                   InputSignal);
+    if (auto Err = Queue->pushKernelLaunch(Kernel, KernelArgs, NumThreads,
+                                           NumBlocks, GroupSize, StackSize,
+                                           OutputSignal, InputSignal))
+      return Err;
+
+    // Register a callback to indicate when the kernel is complete.
+    if (Device.getRPCServer()) {
+      if (auto Err = Slots[Curr].schedCallback(
+              [](void *Data) -> llvm::Error {
+                GenericPluginTy &Plugin =
+                    *reinterpret_cast<GenericPluginTy *>(Data);
+                Plugin.getRPCServer().Thread->finish();
+                return Error::success();
+              },
+              &Device.Plugin))
+        return Err;
+    }
+    return Plugin::success();
   }
 
   /// Push an asynchronous memory copy between pinned memory buffers.
@@ -1479,8 +1488,8 @@ struct AMDGPUStreamTy {
       return Plugin::success();
 
     // Wait until all previous operations on the stream have completed.
-    if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds,
-                                              RPCServer, &Device))
+    if (auto Err =
+            Slots[last()].Signal->wait(StreamBusyWaitMicroseconds, &Device))
      return Err;
 
     // Reset the stream and perform all pending post actions.
@@ -3027,7 +3036,7 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
     : Agent(Device.getAgent()), Queue(nullptr),
       SignalManager(Device.getSignalManager()), Device(Device),
       // Initialize the std::deque with some empty positions.
-      Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
+      Slots(32), NextSlot(0), SyncCycle(0),
       StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
       UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
 
@@ -3383,10 +3392,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   if (auto Err = AMDGPUDevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
 
-  // If this kernel requires an RPC server we attach its pointer to the stream.
-  if (GenericDevice.getRPCServer())
-    Stream->setRPCServer(GenericDevice.getRPCServer());
-
   // Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
   if (ImplArgs &&
       getImplicitArgsSize() == sizeof(hsa_utils::AMDGPUImplicitArgsTy)) {
diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index 5b9b7ffd086b5..f3a8e7555020d 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -19,7 +19,11 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/Error.h"
 
+#include <atomic>
+#include <condition_variable>
 #include <cstdint>
+#include <mutex>
+#include <thread>
 
 namespace llvm::omp::target {
 namespace plugin {
@@ -37,6 +41,12 @@ struct RPCServerTy {
   /// Initializes the handles to the number of devices we may need to service.
   RPCServerTy(plugin::GenericPluginTy &Plugin);
 
+  /// Deinitialize the associated memory and resources.
+  llvm::Error shutDown();
+
+  /// Initialize the worker thread.
+  llvm::Error startThread();
+
   /// Check if this device image is using an RPC server. This checks for the
   /// presence of an externally visible symbol in the device image that will
   /// be present whenever RPC code is called.
@@ -51,17 +61,77 @@ struct RPCServerTy {
                         plugin::GenericGlobalHandlerTy &Handler,
                         plugin::DeviceImageTy &Image);
 
-  /// Runs the RPC server associated with the \p Device until the pending work
-  /// is cleared.
-  llvm::Error runServer(plugin::GenericDeviceTy &Device);
-
   /// Deinitialize the RPC server for the given device. This will free the
   /// memory associated with the k
   llvm::Error deinitDevice(plugin::GenericDeviceTy &Device);
 
 private:
   /// Array from this device's identifier to its attached devices.
-  llvm::SmallVector<void *> Buffers;
+  std::unique_ptr<void *[]> Buffers;
+
+  /// Array of associated devices. These must be alive as long as the server is.
+  std::unique_ptr<plugin::GenericDeviceTy *[]> Devices;
+
+  /// A helper class for running the user thread that handles the RPC interface.
+  /// Because we only need to check the RPC server while any kernels are
+  /// working, we track submission / completion events to allow the thread to
+  /// sleep when it is not needed.
+  struct ServerThread {
+    std::thread Worker;
+
+    /// A boolean indicating whether or not the worker thread should continue.
+    std::atomic<bool> Running;
+
+    /// The number of currently executing kernels across all devices that need
+    /// the server thread to be running.
+    std::atomic<uint32_t> NumUsers;
+
+    /// The condition variable used to suspend the thread if no work is needed.
+    std::condition_variable CV;
+    std::mutex Mutex;
+
+    /// A reference to all the RPC interfaces that the server is handling.
+    llvm::ArrayRef<void *> Buffers;
+
+    /// A reference to the associated generic device for the buffer.
+    llvm::ArrayRef<plugin::GenericDeviceTy *> Devices;
+
+    /// Initialize the worker thread to run in the background.
+    ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[],
+                 size_t Length)
+        : Running(true), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
+          Devices(Devices, Length) {}
+
+    ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
+
+    /// Notify the worker thread that there is a user that needs it.
+    void notify() {
+      std::lock_guard<std::mutex> Lock(Mutex);
+      NumUsers.fetch_add(1, std::memory_order_relaxed);
+      CV.notify_all();
+    }
+
+    /// Indicate that one of the dependent users has finished.
+    void finish() {
+      [[maybe_unused]] uint32_t Old =
+          NumUsers.fetch_sub(1, std::memory_order_relaxed);
+      assert(Old > 0 && "Attempt to signal finish with no pending work");
+    }
+
+    /// Destroy the worker thread and wait.
+    void shutDown();
+
+    /// Initialize the worker thread.
+    void startThread();
+
+    /// Run the server thread to continuously check the RPC interface for work
+    /// to be done for the device.
+    void run();
+  };
+
+public:
+  /// Pointer to the server thread instance.
+  std::unique_ptr<ServerThread> Thread;
 };
 
 } // namespace llvm::omp::target
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index a164bfb51d026..c9acabea6977d 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -1057,6 +1057,9 @@ Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
   if (auto Err = Server.initDevice(*this, Plugin.getGlobalHandler(), Image))
     return Err;
 
+  if (auto Err = Server.startThread())
+    return Err;
+
   RPCServer = &Server;
   DP("Running an RPC server on device %d\n", getDeviceId());
   return Plugin::success();
@@ -1630,8 +1633,11 @@ Error GenericPluginTy::deinit() {
   if (GlobalHandler)
     delete GlobalHandler;
 
-  if (RPCServer)
+  if (RPCServer) {
+    if (Error Err = RPCServer->shutDown())
+      return Err;
     delete RPCServer;
+  }
 
   if (RecordReplay)
     delete RecordReplay;
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index f20c8f7bcc5c9..81ad9ca66808a 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -21,8 +21,8 @@ using namespace omp;
 using namespace target;
 
 template <uint32_t NumLanes>
-rpc::Status handle_offload_opcodes(plugin::GenericDeviceTy &Device,
-                                   rpc::Server::Port &Port) {
+rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
+                                 rpc::Server::Port &Port) {
 
   switch (Port.get_opcode()) {
   case LIBC_MALLOC: {
@@ -62,21 +62,99 @@ rpc::Status handle_offload_opcodes(plugin::GenericDeviceTy &Device,
   return rpc::RPC_SUCCESS;
 }
 
-static rpc::Status handle_offload_opcodes(plugin::GenericDeviceTy &Device,
-                                          rpc::Server::Port &Port,
-                                          uint32_t NumLanes) {
+static rpc::Status handleOffloadOpcodes(plugin::GenericDeviceTy &Device,
+                                        rpc::Server::Port &Port,
+                                        uint32_t NumLanes) {
   if (NumLanes == 1)
-    return handle_offload_opcodes<1>(Device, Port);
+    return handleOffloadOpcodes<1>(Device, Port);
   else if (NumLanes == 32)
-    return handle_offload_opcodes<32>(Device, Port);
+    return handleOffloadOpcodes<32>(Device, Port);
   else if (NumLanes == 64)
-    return handle_offload_opcodes<64>(Device, Port);
+    return handleOffloadOpcodes<64>(Device, Port);
   else
     return rpc::RPC_ERROR;
 }
 
+static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer) {
+  uint64_t NumPorts =
+      std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
+  rpc::Server Server(NumPorts, Buffer);
+
+  auto Port = Server.try_open(Device.getWarpSize());
+  if (!Port)
+    return
rpc::RPC_SUCCESS;
+
+  rpc::Status Status =
+      handleOffloadOpcodes(Device, *Port, Device.getWarpSize());
+
+  // Let the `libc` library handle any other unhandled opcodes.
+#ifdef LIBOMPTARGET_RPC_SUPPORT
+  if (Status == rpc::RPC_UNHANDLED_OPCODE)
+    Status = handle_libc_opcodes(*Port, Device.getWarpSize());
+#endif
+
+  Port->close();
+
+  return Status;
+}
+
+void RPCServerTy::ServerThread::startThread() {
+  Worker = std::thread([this]() { run(); });
+}
+
+void RPCServerTy::ServerThread::shutDown() {
+  {
+    std::lock_guard<std::mutex> Lock(Mutex);
+    Running.store(false, std::memory_order_release);
+    CV.notify_all();
+  }
+  if (Worker.joinable())
+    Worker.join();
+}
+
+void RPCServerTy::ServerThread::run() {
+  std::unique_lock<std::mutex> Lock(Mutex);
+  for (;;) {
+    CV.wait(Lock, [&]() {
+      return NumUsers.load(std::memory_order_acquire) > 0 ||
+             !Running.load(std::memory_order_acquire);
+    });
+
+    if (!Running.load(std::memory_order_acquire))
+      return;
+
+    Lock.unlock();
+    while (NumUsers.load(std::memory_order_relaxed) > 0 &&
+           Running.load(std::memory_order_relaxed)) {
+      for (const auto &[Buffer, Device] : llvm::zip_equal(Buffers, Devices)) {
+        if (!Buffer || !Device)
+          continue;
+
+        // If running the server failed, print a message but keep running.
+        if (runServer(*Device, Buffer) != rpc::RPC_SUCCESS)
+          FAILURE_MESSAGE("Unhandled or invalid RPC opcode!");
+      }
+    }
+    Lock.lock();
+  }
+}
+
 RPCServerTy::RPCServerTy(plugin::GenericPluginTy &Plugin)
-    : Buffers(Plugin.getNumDevices()) {}
+    : Buffers(std::make_unique<void *[]>(Plugin.getNumDevices())),
+      Devices(std::make_unique<plugin::GenericDeviceTy *[]>(
+          Plugin.getNumDevices())),
+      Thread(new ServerThread(Buffers.get(), Devices.get(),
+                              Plugin.getNumDevices())) {}
+
+llvm::Error RPCServerTy::startThread() {
+  Thread->startThread();
+  return Error::success();
+}
+
+llvm::Error RPCServerTy::shutDown() {
+  Thread->shutDown();
+  return Error::success();
+}
 
 llvm::Expected<bool>
 RPCServerTy::isDeviceUsingRPC(plugin::GenericDeviceTy &Device,
@@ -108,35 +186,14 @@ Error RPCServerTy::initDevice(plugin::GenericDeviceTy &Device,
                            sizeof(rpc::Client), nullptr))
     return Err;
   Buffers[Device.getDeviceId()] = RPCBuffer;
-
-  return Error::success();
-}
-
-Error RPCServerTy::runServer(plugin::GenericDeviceTy &Device) {
-  uint64_t NumPorts =
-      std::min(Device.requestedRPCPortCount(), rpc::MAX_PORT_COUNT);
-  rpc::Server Server(NumPorts, Buffers[Device.getDeviceId()]);
-
-  auto Port = Server.try_open(Device.getWarpSize());
-  if (!Port)
-    return Error::success();
-
-  int Status = handle_offload_opcodes(Device, *Port, Device.getWarpSize());
-
-  // Let the `libc` library handle any other unhandled opcodes.
-#ifdef LIBOMPTARGET_RPC_SUPPORT
-  if (Status == rpc::RPC_UNHANDLED_OPCODE)
-    Status = handle_libc_opcodes(*Port, Device.getWarpSize());
-#endif
-
-  Port->close();
-
-  if (Status != rpc::RPC_SUCCESS)
-    return createStringError("RPC server given invalid opcode!");
 
   return Error::success();
 }
 
 Error RPCServerTy::deinitDevice(plugin::GenericDeviceTy &Device) {
   Device.free(Buffers[Device.getDeviceId()], TARGET_ALLOC_HOST);
+  Buffers[Device.getDeviceId()] = nullptr;
+  Devices[Device.getDeviceId()] = nullptr;
   return Error::success();
 }
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index 5ec3adb9e4e3a..7878499dbfcb7 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -63,6 +63,7 @@ DLWRAP(cuStreamCreate, 2)
 DLWRAP(cuStreamDestroy, 1)
 DLWRAP(cuStreamSynchronize, 1)
 DLWRAP(cuStreamQuery, 1)
+DLWRAP(cuStreamAddCallback, 4)
 DLWRAP(cuCtxSetCurrent, 1)
 DLWRAP(cuDevicePrimaryCtxRelease, 1)
 DLWRAP(cuDevicePrimaryCtxGetState, 3)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 16c8f7ad46c44..ad874735a25ed 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -286,6 +286,8 @@ static inline void *CU_LAUNCH_PARAM_END = (void *)0x00;
 static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
+typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
 CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
@@ -326,6 +328,7 @@ CUresult cuStreamCreate(CUstream *, unsigned);
 CUresult cuStreamDestroy(CUstream);
 CUresult cuStreamSynchronize(CUstream);
 CUresult cuStreamQuery(CUstream);
+CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int);
 CUresult cuCtxSetCurrent(CUcontext);
 CUresult cuDevicePrimaryCtxRelease(CUdevice);
 CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 894d1c2214b97..52e8a100dc87b 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -628,17 +628,7 @@ struct CUDADeviceTy : public GenericDeviceTy {
   Error synchronizeImpl(__tgt_async_info &AsyncInfo) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
     CUresult Res;
-    // If we have an RPC server running on this device we will continuously
-    // query it for work rather than blocking.
-    if (!getRPCServer()) {
-      Res = cuStreamSynchronize(Stream);
-    } else {
-      do {
-        Res = cuStreamQuery(Stream);
-        if (auto Err = getRPCServer()->runServer(*this))
-          return Err;
-      } while (Res == CUDA_ERROR_NOT_READY);
-    }
+    Res = cuStreamSynchronize(Stream);
 
     // Once the stream is synchronized, return it to stream pool and reset
     // AsyncInfo. This is to make sure the synchronization only works for its
@@ -823,17 +813,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (auto Err = getStream(AsyncInfoWrapper, Stream))
       return Err;
 
-    // If there is already pending work on the stream it could be waiting for
-    // someone to check the RPC server.
-    if (auto *RPCServer = getRPCServer()) {
-      CUresult Res = cuStreamQuery(Stream);
-      while (Res == CUDA_ERROR_NOT_READY) {
-        if (auto Err = RPCServer->runServer(*this))
-          return Err;
-        Res = cuStreamQuery(Stream);
-      }
-    }
-
     CUresult Res = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream);
     return Plugin::check(Res, "Error in cuMemcpyDtoHAsync: %s");
   }
@@ -1292,9 +1271,25 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
       reinterpret_cast<void *>(&LaunchParams.Size),
       CU_LAUNCH_PARAM_END};
 
+  // If we are running an RPC server we want to wake up the server thread
+  // whenever there is a kernel running and let it sleep otherwise.
+  if (GenericDevice.getRPCServer())
+    GenericDevice.Plugin.getRPCServer().Thread->notify();
+
   CUresult Res =
       cuLaunchKernel(Func, NumBlocks[0], NumBlocks[1], NumBlocks[2],
                      NumThreads[0], NumThreads[1], NumThreads[2],
                      MaxDynCGroupMem, Stream, nullptr, Config);
+
+  // Register a callback to indicate when the kernel is complete.
+  if (GenericDevice.getRPCServer())
+    cuLaunchHostFunc(
+        Stream,
+        [](void *Data) {
+          GenericPluginTy &Plugin = *reinterpret_cast<GenericPluginTy *>(Data);
+          Plugin.getRPCServer().Thread->finish();
+        },
+        &GenericDevice.Plugin);
+
   return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
 }
diff --git a/offload/test/libc/server.c b/offload/test/libc/server.c
new file mode 100644
index 0000000000000..67f60a648235a
--- /dev/null
+++ b/offload/test/libc/server.c
@@ -0,0 +1,56 @@
+// RUN: %libomptarget-compile-run-and-check-generic
+
+// REQUIRES: libc
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#pragma omp begin declare variant match(device = {kind(gpu)})
+// Extension provided by the 'libc' project.
+unsigned long long __llvm_omp_host_call(void *fn, void *args, size_t size);
+#pragma omp declare target to(__llvm_omp_host_call) device_type(nohost)
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+// Dummy host implementation to make this work for all targets.
+unsigned long long __llvm_omp_host_call(void *fn, void *args, size_t size) {
+  return ((unsigned long long (*)(void *))fn)(args);
+}
+#pragma omp end declare variant
+
+long long foo(void *data) { return -1; }
+
+void *fn_ptr = NULL;
+#pragma omp declare target to(fn_ptr)
+
+int main() {
+  fn_ptr = (void *)&foo;
+#pragma omp target update to(fn_ptr)
+
+  for (int i = 0; i < 4; ++i) {
+#pragma omp target
+    {
+      long long res = __llvm_omp_host_call(fn_ptr, NULL, 0);
+      assert(res == -1 && "RPC call failed\n");
+    }
+
+    for (int j = 0; j < 128; ++j) {
+#pragma omp target nowait
+      {
+        long long res = __llvm_omp_host_call(fn_ptr, NULL, 0);
+        assert(res == -1 && "RPC call failed\n");
+      }
+    }
+#pragma omp taskwait
+
+#pragma omp target
+    {
+      long long res = __llvm_omp_host_call(fn_ptr, NULL, 0);
+      assert(res == -1 && "RPC call failed\n");
+    }
+  }
+
+  // CHECK: PASS
+  puts("PASS");
+}

From bd8a8181288c9e16eb90fff78cbbc63b4687963a Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Fri, 24 Jan 2025 11:40:15 -0600
Subject: [PATCH 030/432] [Offload] Add cuLaunchHostFunc to dynamic cuda

Summary:
This was missing, causing non-directly linked builds to fail.
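For reference, a minimal sketch of how the wrapped entry point is used,
matching the declarations added above (illustrative only: it assumes an
initialized driver and current context, and `Func`/`Args` stand in for a
real kernel):

    // The host function runs on a CUDA-internal thread once all prior
    // work on the stream has completed; it must not call CUDA APIs.
    static void onStreamDone(void *Data) { (void)Data; }

    static CUresult launchWithCompletion(CUfunction Func, void **Args) {
      CUstream Stream;
      CUresult Res = cuStreamCreate(&Stream, /*flags=*/0);
      if (Res != CUDA_SUCCESS)
        return Res;
      Res = cuLaunchKernel(Func, /*grid=*/1, 1, 1, /*block=*/64, 1, 1,
                           /*sharedMemBytes=*/0, Stream, Args, nullptr);
      if (Res == CUDA_SUCCESS)
        Res = cuLaunchHostFunc(Stream, onStreamDone, nullptr);
      return Res;
    }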
--- offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp | 1 + offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index 7878499dbfcb7..e5332686fcffb 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -39,6 +39,7 @@ DLWRAP(cuDriverGetVersion, 1) DLWRAP(cuGetErrorString, 2) DLWRAP(cuLaunchKernel, 11) +DLWRAP(cuLaunchHostFunc, 3) DLWRAP(cuMemAlloc, 2) DLWRAP(cuMemAllocHost, 2) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index ad874735a25ed..ac075c875a8bb 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -30,15 +30,16 @@ typedef uintptr_t CUdeviceptr; typedef struct CUmod_st *CUmodule; typedef struct CUctx_st *CUcontext; typedef struct CUfunc_st *CUfunction; +typedef void (*CUhostFn)(void *userData); typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; -#define CU_DEVICE_INVALID ((CUdevice)-2) +#define CU_DEVICE_INVALID ((CUdevice)(-2)) typedef unsigned long long CUmemGenericAllocationHandle_v1; typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; -#define CU_DEVICE_INVALID ((CUdevice)-2) +#define CU_DEVICE_INVALID ((CUdevice)(-2)) typedef enum CUmemAllocationGranularity_flags_enum { CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, @@ -304,6 +305,7 @@ CUresult cuInit(unsigned); CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, unsigned, CUstream, void **, void **); +CUresult cuLaunchHostFunc(CUstream, CUhostFn, void *); CUresult cuMemAlloc(CUdeviceptr *, size_t); CUresult cuMemAllocHost(void **, size_t); From 0cd794d4860e376698bb4da24bcdf8cbf331835c Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 24 Jan 2025 18:56:02 +0100 Subject: [PATCH 031/432] [libc++][chrono] implements UTC clock. (#90393) While implementing this feature and its associated LWG issues it turns out - LWG3316 Correctly define epoch for utc_clock / utc_timepoint only added non-normative wording to the standard. 
Implements parts of: - P0355 Extending to Calendars and Time Zones - P1361 Integration of chrono with text formatting - LWG3359 leap second support should allow for negative leap seconds --- libcxx/docs/Status/Cxx20Issues.csv | 2 +- libcxx/docs/Status/FormatPaper.csv | 2 +- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__chrono/convert_to_tm.h | 23 + libcxx/include/__chrono/formatter.h | 18 + libcxx/include/__chrono/ostream.h | 13 + libcxx/include/__chrono/utc_clock.h | 163 +++ libcxx/include/chrono | 38 + libcxx/include/module.modulemap | 4 + libcxx/modules/std/chrono.inc | 11 +- libcxx/test/benchmarks/utc_clock.bench.cpp | 60 + .../diagnostics/chrono.nodiscard.verify.cpp | 14 + .../get_leap_second_info.pass.cpp | 147 +++ .../time.clock.utc.members/from_sys.pass.cpp | 108 ++ .../time.clock.utc.members/to_sys.pass.cpp | 117 ++ .../get_leap_second_info.pass.cpp | 128 +++ .../leap_second_info.members.pass.cpp | 37 + .../time.clock.utc.members/from_sys.pass.cpp | 245 ++++ .../time.clock.utc.members/now.pass.cpp | 38 + .../time.clock.utc.members/to_sys.pass.cpp | 252 +++++ .../time.clock.utc/types.compile.pass.cpp | 60 + .../time.clock.utc/utc_time.ostream.pass.cpp | 165 +++ .../time/time.syn/formatter.utc_time.pass.cpp | 1004 +++++++++++++++++ .../concept.formattable.compile.pass.cpp | 8 +- 24 files changed, 2653 insertions(+), 5 deletions(-) create mode 100644 libcxx/include/__chrono/utc_clock.h create mode 100644 libcxx/test/benchmarks/utc_clock.bench.cpp create mode 100644 libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp create mode 100644 libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp create mode 100644 libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp create mode 100644 libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp diff --git a/libcxx/docs/Status/Cxx20Issues.csv b/libcxx/docs/Status/Cxx20Issues.csv index 3462557e8d668..ca286146840b1 100644 --- a/libcxx/docs/Status/Cxx20Issues.csv +++ b/libcxx/docs/Status/Cxx20Issues.csv @@ -238,7 +238,7 @@ "`LWG3313 `__","``join_view::iterator::operator--``\ is incorrectly constrained","2020-02 (Prague)","|Complete|","14","" "`LWG3314 `__","Is stream insertion behavior locale dependent when ``Period::type``\ is ``micro``\ ?","2020-02 (Prague)","|Complete|","16","" "`LWG3315 `__","LWG3315: Correct Allocator Default Behavior","2020-02 (Prague)","|Complete|","","" -"`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","","","" +"`LWG3316 `__","Correctly define epoch for ``utc_clock``\ / ``utc_timepoint``\ ","2020-02 (Prague)","|Nothing To Do|","","" "`LWG3317 `__","Incorrect ``operator<<``\ for floating-point durations","2020-02 (Prague)","|Complete|","16","" "`LWG3318 `__","Clarify whether clocks can 
represent time before their epoch","2020-02 (Prague)","","",""
 "`LWG3319 <https://wg21.link/LWG3319>`__","Properly reference specification of IANA time zone database","2020-02 (Prague)","|Nothing To Do|","",""
diff --git a/libcxx/docs/Status/FormatPaper.csv b/libcxx/docs/Status/FormatPaper.csv
index 7f5f46d834d3e..343fa62f13565 100644
--- a/libcxx/docs/Status/FormatPaper.csv
+++ b/libcxx/docs/Status/FormatPaper.csv
@@ -2,7 +2,7 @@ Section,Description,Dependencies,Assignee,Status,First released version
 `P1361 <https://wg21.link/P1361>`__ `P2372 <https://wg21.link/P2372>`__,"Formatting chrono"
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::duration``",,Mark de Wever,|Complete|,16
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::sys_time``",,Mark de Wever,|Complete|,17
-`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::utc_time``",A ``<chrono>`` implementation,Mark de Wever,,,
+`[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::utc_time``",A ``<chrono>`` implementation,Mark de Wever,|Complete|,20
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::tai_time``",A ``<chrono>`` implementation,Mark de Wever,,,
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::gps_time``",A ``<chrono>`` implementation,Mark de Wever,,,
 `[time.syn] <https://wg21.link/time.syn>`_,"Formatter ``chrono::file_time``",,Mark de Wever,|Complete|,17
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index f3313bf53460a..78d3192542b5a 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -275,6 +275,7 @@ set(files
   __chrono/time_zone_link.h
   __chrono/tzdb.h
   __chrono/tzdb_list.h
+  __chrono/utc_clock.h
   __chrono/weekday.h
   __chrono/year.h
   __chrono/year_month.h
diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h
index 8a16c4f996a86..e547e107a5852 100644
--- a/libcxx/include/__chrono/convert_to_tm.h
+++ b/libcxx/include/__chrono/convert_to_tm.h
@@ -24,6 +24,7 @@
 #include <__chrono/sys_info.h>
 #include <__chrono/system_clock.h>
 #include <__chrono/time_point.h>
+#include <__chrono/utc_clock.h>
 #include <__chrono/weekday.h>
 #include <__chrono/year.h>
 #include <__chrono/year_month.h>
@@ -98,6 +99,22 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const chrono::sys_time<_Duration> __tp
   return __result;
 }
 
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+template <class _Tm, class _Duration>
+_LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(chrono::utc_time<_Duration> __tp) {
+  _Tm __result = std::__convert_to_tm<_Tm>(chrono::utc_clock::to_sys(__tp));
+
+  if (chrono::get_leap_second_info(__tp).is_leap_second)
+    ++__result.tm_sec;
+
+  return __result;
+}
+
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+
 // Convert a chrono (calendar) time point, or duration to the given _Tm type,
 // which must have the same properties as std::tm.
 template <class _Tm, class _ChronoT>
@@ -110,6 +127,12 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) {
   if constexpr (__is_time_point<_ChronoT>) {
     if constexpr (same_as<typename _ChronoT::clock, chrono::system_clock>)
       return std::__convert_to_tm<_Tm>(__value);
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+    else if constexpr (same_as<typename _ChronoT::clock, chrono::utc_clock>)
+      return std::__convert_to_tm<_Tm>(__value);
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
     else if constexpr (same_as<typename _ChronoT::clock, chrono::file_clock>)
       return std::__convert_to_tm<_Tm>(_ChronoT::clock::to_sys(__value));
     else if constexpr (same_as<typename _ChronoT::clock, chrono::local_t>)
diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h
index 400eb8c7fdcd2..6153fdc35a47b 100644
--- a/libcxx/include/__chrono/formatter.h
+++ b/libcxx/include/__chrono/formatter.h
@@ -32,6 +32,7 @@
 # include <__chrono/sys_info.h>
 # include <__chrono/system_clock.h>
 # include <__chrono/time_point.h>
+# include <__chrono/utc_clock.h>
 # include <__chrono/weekday.h>
 # include <__chrono/year.h>
 # include <__chrono/year_month.h>
@@ -719,6 +720,23 @@ struct _LIBCPP_TEMPLATE_VIS formatter<chrono::sys_time<_Duration>, _CharT> : pub
   }
 };
 
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+template <class _Duration, __fmt_char_type _CharT>
+struct _LIBCPP_TEMPLATE_VIS formatter<chrono::utc_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
+public:
+  using _Base _LIBCPP_NODEBUG = __formatter_chrono<_CharT>;
+
+  template <class _ParseContext>
+  _LIBCPP_HIDE_FROM_ABI constexpr typename _ParseContext::iterator parse(_ParseContext& __ctx) {
+    return _Base::__parse(__ctx, __format_spec::__fields_chrono, __format_spec::__flags::__clock);
+  }
+};
+
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+
 template <class _Duration, __fmt_char_type _CharT>
 struct _LIBCPP_TEMPLATE_VIS formatter<chrono::file_time<_Duration>, _CharT> : public __formatter_chrono<_CharT> {
 public:
diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h
index 41884647f927d..66735e5c2c28b 100644
--- a/libcxx/include/__chrono/ostream.h
+++ b/libcxx/include/__chrono/ostream.h
@@ -26,6 +26,7 @@
 # include <__chrono/statically_widen.h>
 # include <__chrono/sys_info.h>
 # include <__chrono/system_clock.h>
+# include <__chrono/utc_clock.h>
 # include <__chrono/weekday.h>
 # include <__chrono/year.h>
 # include <__chrono/year_month.h>
@@ -61,6 +62,18 @@
 operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_days& __dp) {
   return __os << year_month_day{__dp};
 }
 
+# if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+#  if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+template <class _CharT, class _Traits, class _Duration>
+_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
+operator<<(basic_ostream<_CharT, _Traits>& __os, const utc_time<_Duration>& __tp) {
+  return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%F %T}"), __tp);
+}
+
+#  endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+# endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM
+
 template <class _CharT, class _Traits, class _Duration>
 _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>&
 operator<<(basic_ostream<_CharT, _Traits>& __os, const file_time<_Duration> __tp) {
diff --git a/libcxx/include/__chrono/utc_clock.h b/libcxx/include/__chrono/utc_clock.h
new file mode 100644
index 0000000000000..647b6eda13ea2
--- /dev/null
+++ b/libcxx/include/__chrono/utc_clock.h
@@ -0,0 +1,163 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___CHRONO_UTC_CLOCK_H
+#define _LIBCPP___CHRONO_UTC_CLOCK_H
+
+#include <version>
+// Enable the contents of the header only when libc++ was built with experimental features enabled.
+#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+# include <__chrono/duration.h>
+# include <__chrono/leap_second.h>
+# include <__chrono/system_clock.h>
+# include <__chrono/time_point.h>
+# include <__chrono/tzdb.h>
+# include <__chrono/tzdb_list.h>
+# include <__config>
+# include <__type_traits/common_type.h>
+
+# if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+# endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+# if _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+
+namespace chrono {
+
+class utc_clock;
+
+template <class _Duration>
+using utc_time  = time_point<utc_clock, _Duration>;
+using utc_seconds = utc_time<seconds>;
+
+class utc_clock {
+public:
+  using rep        = system_clock::rep;
+  using period     = system_clock::period;
+  using duration   = chrono::duration<rep, period>;
+  using time_point = chrono::time_point<utc_clock>;
+  static constexpr bool is_steady = false; // The system_clock is not steady.
+
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static time_point now() { return from_sys(system_clock::now()); }
+
+  template <class _Duration>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static sys_time<common_type_t<_Duration, seconds>>
+  to_sys(const utc_time<_Duration>& __time);
+
+  template <class _Duration>
+  [[nodiscard]] _LIBCPP_HIDE_FROM_ABI static utc_time<common_type_t<_Duration, seconds>>
+  from_sys(const sys_time<_Duration>& __time) {
+    using _Rp = utc_time<common_type_t<_Duration, seconds>>;
+    // TODO TZDB investigate optimizations.
+    //
+    // The leap second database stores all transitions; this means that to
+    // calculate the current number of leap seconds the code needs to iterate
+    // over all leap seconds to accumulate the sum. Then the sum can be used
+    // to determine the sys_time. Accessing the database involves acquiring a
+    // mutex.
+    //
+    // The historic entries in the database are immutable. Hard-coding these
+    // values in a table would allow:
+    // - Storing the sum, allowing a binary search on the data.
+    // - Avoiding acquiring a mutex.
+    // The disadvantage is a slightly larger code size.
+    //
+    // There are two optimization directions:
+    // - Hard-code the database and do a linear search for future entries.
+    //   This search can start at the back, and should probably contain very
+    //   few entries. (Adding leap seconds is quite rare and new releases of
+    //   libc++ can add the new entries; they are announced half a year before
+    //   they are added.)
+    // - While parsing the leap seconds, store an additional database in the
+    //   dylib with the list of the sums of the leap seconds. In that case
+    //   there can be a private function __get_utc_to_sys_table that returns
+    //   the table.
+    //
+    // Note for to_sys there are no optimizations to be done; it uses
+    // get_leap_second_info. The function get_leap_second_info could benefit
+    // from optimizations as described above; again both options apply.
+
+    // Both UTC and the system clock use the same epoch. The Standard
+    // specifies the epoch as 1970-01-01 even though UTC starts at
+    // 1972-01-01 00:00:10 TAI. So when the sys_time is before the epoch we
+    // can be sure both clocks return the same value.
+
+    const tzdb& __tzdb = chrono::get_tzdb();
+    _Rp __result{__time.time_since_epoch()};
+    for (const auto& __leap_second : __tzdb.leap_seconds) {
+      if (__leap_second > __time)
+        return __result;
+
+      __result += __leap_second.value();
+    }
+    return __result;
+  }
+};
+
+struct leap_second_info {
+  bool is_leap_second;
+  seconds elapsed;
+};
+
+template <class _Duration>
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI leap_second_info get_leap_second_info(const utc_time<_Duration>& __time) {
+  const tzdb& __tzdb = chrono::get_tzdb();
+  if (__tzdb.leap_seconds.empty()) [[unlikely]]
+    return {false, chrono::seconds{0}};
+
+  sys_seconds __sys{chrono::floor<seconds>(__time).time_since_epoch()};
+  seconds __elapsed{0};
+  for (const auto& __leap_second : __tzdb.leap_seconds) {
+    if (__sys == __leap_second.date() + __elapsed)
+      // A time point may only be a leap second during a positive leap second
+      // insertion, since time points that occur during a (theoretical)
+      // negative leap second don't exist.
+      return {__leap_second.value() > 0s, __elapsed + __leap_second.value()};
+
+    if (__sys < __leap_second.date() + __elapsed)
+      return {false, __elapsed};
+
+    __elapsed += __leap_second.value();
+  }
+
+  return {false, __elapsed};
+}
+
+template <class _Duration>
+[[nodiscard]] _LIBCPP_HIDE_FROM_ABI sys_time<common_type_t<_Duration, seconds>>
+utc_clock::to_sys(const utc_time<_Duration>& __time) {
+  using _Dp = common_type_t<_Duration, seconds>;
+  leap_second_info __info = chrono::get_leap_second_info(__time);
+
+  // [time.clock.utc.members]/2
+  //   Returns: A sys_time t, such that from_sys(t) == u if such a mapping
+  //   exists. Otherwise u represents a time_point during a positive leap
+  //   second insertion, the conversion counts that leap second as not
+  //   inserted, and the last representable value of sys_time prior to the
+  //   insertion of the leap second is returned.
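+  //
+  // Editorial worked example, not part of the upstream patch: with the real
+  // 1972-06-30 23:59:60 UTC insertion, a utc_time<milliseconds> 500ms into
+  // that leap second has __info = {true, 1s}. Plain subtraction of
+  // __info.elapsed would yield 23:59:59.500 sys (which from_sys would not
+  // map back to the input), so the clamp below returns 23:59:59.999 instead,
+  // the last representable millisecond before the insertion.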
+  sys_time<common_type_t<_Duration, seconds>> __result{__time.time_since_epoch() - __info.elapsed};
+  if (__info.is_leap_second)
+    return chrono::floor<seconds>(__result) + chrono::seconds{1} - _Dp{1};
+
+  return __result;
+}
+
+} // namespace chrono
+
+# endif // _LIBCPP_STD_VER >= 20 && _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM &&
+        // _LIBCPP_HAS_LOCALIZATION
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB)
+
+#endif // _LIBCPP___CHRONO_UTC_CLOCK_H
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index d9a8afef933b9..10695eea649fb 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -300,6 +300,41 @@ template<class charT, class traits> // C++20
   basic_ostream<charT, traits>&
     operator<<(basic_ostream<charT, traits>& os, const sys_days& dp);
 
+// [time.clock.utc], class utc_clock
+class utc_clock { // C++20
+public:
+  using rep = a signed arithmetic type;
+  using period = ratio<unspecified, unspecified>;
+  using duration = chrono::duration<rep, period>;
+  using time_point = chrono::time_point<utc_clock>;
+  static constexpr bool is_steady = unspecified;
+
+  static time_point now();
+
+  template<class Duration>
+  static sys_time<common_type_t<Duration, seconds>>
+  to_sys(const utc_time<Duration>& t);
+  template<class Duration>
+  static utc_time<common_type_t<Duration, seconds>>
+  from_sys(const sys_time<Duration>& t);
+};
+
+template<class Duration>
+using utc_time = time_point<utc_clock, Duration>; // C++20
+using utc_seconds = utc_time<seconds>;            // C++20
+
+template<class charT, class traits, class Duration> // C++20
+  basic_ostream<charT, traits>&
+  operator<<(basic_ostream<charT, traits>& os, const utc_time<Duration>& t);
+
+struct leap_second_info { // C++20
+  bool is_leap_second;
+  seconds elapsed;
+};
+
+template<class Duration> // C++20
+  leap_second_info get_leap_second_info(const utc_time<Duration>& ut);
+
 class file_clock // C++20
 {
 public:
@@ -861,6 +896,8 @@ strong_ordering operator<=>(const time_zone_link& x, const time_zone_link& y);
 namespace std {
   template<class Duration, class charT>
     struct formatter<chrono::sys_time<Duration>, charT>;  // C++20
+  template<class Duration, class charT>
+    struct formatter<chrono::utc_time<Duration>, charT>;  // C++20
   template<class Duration, class charT>
     struct formatter<chrono::file_time<Duration>, charT>; // C++20
   template<class Duration, class charT>
@@ -981,6 +1018,7 @@ constexpr chrono::year operator ""y(unsigned lo
 #  include <__chrono/time_zone_link.h>
 #  include <__chrono/tzdb.h>
 #  include <__chrono/tzdb_list.h>
+#  include <__chrono/utc_clock.h>
 #  include <__chrono/zoned_time.h>
 # endif
diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index cdac9c883ecab..85b88ca137f85 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -980,6 +980,10 @@ module std [system] {
     export std.string // public data member of type std::string
     export std.vector // public data members of type std::vector
   }
+  module utc_clock {
+    header "__chrono/utc_clock.h"
+    export std.chrono.time_point
+  }
   module weekday { header "__chrono/weekday.h" }
   module year_month_day { header "__chrono/year_month_day.h" }
   module year_month_weekday { header "__chrono/year_month_weekday.h" }
diff --git a/libcxx/modules/std/chrono.inc b/libcxx/modules/std/chrono.inc
index df21d1fbac585..98f14f716c207 100644
--- a/libcxx/modules/std/chrono.inc
+++ b/libcxx/modules/std/chrono.inc
@@ -84,7 +84,9 @@ export namespace std {
   using std::chrono::sys_seconds;
   using std::chrono::sys_time;
 
-#if 0
+#if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+# ifdef _LIBCPP_ENABLE_EXPERIMENTAL
+
   // [time.clock.utc], class utc_clock
   using std::chrono::utc_clock;
 
@@ -94,6 +96,8 @@ export namespace std {
 
   using std::chrono::leap_second_info;
   using std::chrono::get_leap_second_info;
+
+# if 0
   // [time.clock.tai], class tai_clock
   using std::chrono::tai_clock;
 
@@ -105,7 +109,10 @@ export namespace std {
   using std::chrono::gps_seconds;
   using std::chrono::gps_time;
 
-#endif
+# endif
+# endif // _LIBCPP_ENABLE_EXPERIMENTAL
+#endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION
+
 
   // [time.clock.file], type file_clock
   using std::chrono::file_clock;
diff --git a/libcxx/test/benchmarks/utc_clock.bench.cpp b/libcxx/test/benchmarks/utc_clock.bench.cpp
new file mode 100644
index 0000000000000..c44652a8f7ae0
--- /dev/null
+++ b/libcxx/test/benchmarks/utc_clock.bench.cpp
@@ -0,0 +1,60 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// UNSUPPORTED: no-filesystem, no-localization, no-tzdb
+
+// XFAIL: libcpp-has-no-experimental-tzdb
+// XFAIL: availability-tzdb-missing
+
+#include <chrono>
+
+#include "benchmark/benchmark.h"
+
+// Benchmarks the performance of the UTC <-> system time conversions. These
+// operations determine the sum of leap second insertions at a specific time.
+
+static void BM_from_sys(benchmark::State& state) {
+  std::chrono::sys_days date{std::chrono::July / 1 / state.range(0)};
+  for (auto _ : state)
+    benchmark::DoNotOptimize(std::chrono::utc_clock::from_sys(date));
+}
+
+BENCHMARK(BM_from_sys)
+    ->Arg(1970)  // before the first leap seconds
+    ->Arg(1979)  // in the first half of inserted leap seconds
+    ->Arg(1993)  // in the second half of inserted leap seconds
+    ->Arg(2100); // after the last leap second
+
+BENCHMARK(BM_from_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(4);
+BENCHMARK(BM_from_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(16);
+
+static void BM_to_sys(benchmark::State& state) {
+  // The 59 sec offset means we pass the UTC offset for the leap second;
+  // assuming there won't be more than 59 leap seconds ever.
+ std::chrono::utc_seconds date{ + std::chrono::sys_days{std::chrono::July / 1 / state.range(0)}.time_since_epoch() + std::chrono::seconds{59}}; + for (auto _ : state) + benchmark::DoNotOptimize(std::chrono::utc_clock::to_sys(date)); +} + +BENCHMARK(BM_to_sys) + ->Arg(1970) // before the first leap seconds + ->Arg(1979) // in the first half of inserted leap seconds + ->Arg(1993) // in the second half of inserted leap seconds + ->Arg(2100); // after the last leap second + +BENCHMARK(BM_to_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(4); +BENCHMARK(BM_to_sys)->Arg(1970)->Arg(1979)->Arg(1993)->Arg(2100)->Threads(16); + +int main(int argc, char** argv) { + benchmark::Initialize(&argc, argv); + if (benchmark::ReportUnrecognizedArguments(argc, argv)) + return 1; + + benchmark::RunSpecifiedBenchmarks(); +} diff --git a/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp b/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp index 4cb10ae3c35e9..644c5b598c018 100644 --- a/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp +++ b/libcxx/test/libcxx/diagnostics/chrono.nodiscard.verify.cpp @@ -75,6 +75,20 @@ void test(std::chrono::time_zone tz, std::chrono::time_zone_link link, std::chro t::locate_zone(""); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} } + { // [time.clock.utc] + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::utc_clock::now(); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::utc_clock::to_sys(std::chrono::utc_seconds{}); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::utc_clock::from_sys(std::chrono::sys_seconds{}); + + // expected-warning@+1 {{ignoring return value of function declared with 'nodiscard' attribute}} + std::chrono::get_leap_second_info(std::chrono::utc_seconds{}); + } + { std::chrono::zoned_time zt; diff --git a/libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp b/libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp new file mode 100644 index 0000000000000..e87c5438179ef --- /dev/null +++ b/libcxx/test/libcxx/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp @@ -0,0 +1,147 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// std::chrono::leap_second_info get_leap_second_info(const utc_time& ut); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" +#include "filesystem_test_helper.h" +#include "test_tzdb.h" + +scoped_test_env env; +[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo"); +const std::filesystem::path tzdata = env.create_file("zoneinfo/tzdata.zi"); +const std::filesystem::path leap_seconds = env.create_file("zoneinfo/leap-seconds.list"); + +std::string_view std::chrono::__libcpp_tzdb_directory() { + static std::string result = dir.string(); + return result; +} + +static void write(std::string_view input) { + static int version = 0; + + { + std::ofstream f{tzdata}; + f << "# version " << version++ << '\n'; + std::ofstream{leap_seconds}.write(input.data(), input.size()); + } + std::chrono::reload_tzdb(); +} + +template +static void test_leap_second_info( + std::chrono::time_point time, bool is_leap_second, std::chrono::seconds elapsed) { + std::chrono::leap_second_info result = std::chrono::get_leap_second_info(time); + TEST_REQUIRE( + result.is_leap_second == is_leap_second && result.elapsed == elapsed, + TEST_WRITE_CONCATENATED( + "\nExpected output [", + is_leap_second, + ", ", + elapsed, + "]\nActual output [", + result.is_leap_second, + ", ", + result.elapsed, + "]\n")); +} + +static void test_no_leap_seconds_entries() { + using namespace std::literals::chrono_literals; + + write(""); + + test_leap_second_info( + std::chrono::utc_seconds{std::chrono::sys_days{std::chrono::January / 1 / 1900}.time_since_epoch()}, false, 0s); + test_leap_second_info( + std::chrono::utc_seconds{std::chrono::sys_days{std::chrono::January / 1 / 2000}.time_since_epoch()}, false, 0s); + test_leap_second_info( + std::chrono::utc_seconds{std::chrono::sys_days{std::chrono::January / 1 / 3000}.time_since_epoch()}, false, 0s); +} + +// Note at the time of writing all leap seconds are positive. This test uses +// fake data to test the behaviour of negative leap seconds. +static void test_negative_leap_seconds() { + using namespace std::literals::chrono_literals; + + // Use small values for simplicity. The dates are seconds since 1.1.1900. + write( + R"( +1 10 +60 11 +120 12 +180 11 +240 12 +300 13 +360 12 +)"); + + // Transitions from the start of UTC. 
+ auto test_transition = [](std::chrono::utc_seconds time, std::chrono::seconds elapsed, bool positive) { + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition is_leap_second -> false, elapsed -> elapsed + // - at the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns after the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns before the end of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - at the end of the transition is_leap_second -> false, elapsed -> elapsed + 1 + + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, true, elapsed + 1s); + test_leap_second_info(time + 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s - 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s, false, elapsed + 1s); + } else { + // Every transition has the following tests + // - 1ns before the transition is_leap_second -> false, elapsed -> elapsed + // - at the transition is_leap_second -> false elapsed -> elapsed - 1 + // - 1ns after the transition is_leap_second -> false, elapsed -> elapsed - 1 + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, false, elapsed - 1s); + test_leap_second_info(time + 1ns, false, elapsed - 1s); + } + }; + + std::chrono::utc_seconds epoch{std::chrono::sys_days{std::chrono::January / 1 / 1900}.time_since_epoch()}; + test_leap_second_info(epoch, false, 0s); + + // The UTC times are: + // epoch + transition time in the database + leap seconds before the transition. + test_transition(epoch + 60s + 0s, 0s, true); + test_transition(epoch + 120s + 1s, 1s, true); + test_transition(epoch + 180s + 2s, 2s, false); + test_transition(epoch + 240s + 1s, 1s, true); + test_transition(epoch + 300s + 2s, 2s, true); + test_transition(epoch + 360s + 3s, 3s, false); +} + +int main(int, const char**) { + test_no_leap_seconds_entries(); + test_negative_leap_seconds(); + + return 0; +} diff --git a/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp new file mode 100644 index 0000000000000..2468daa95c29d --- /dev/null +++ b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// static utc_time> +// from_sys(const sys_time& time); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" +#include "filesystem_test_helper.h" +#include "test_tzdb.h" + +scoped_test_env env; +[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo"); +const std::filesystem::path tzdata = env.create_file("zoneinfo/tzdata.zi"); +const std::filesystem::path leap_seconds = env.create_file("zoneinfo/leap-seconds.list"); + +std::string_view std::chrono::__libcpp_tzdb_directory() { + static std::string result = dir.string(); + return result; +} + +static void write(std::string_view input) { + static int version = 0; + + std::ofstream f{tzdata}; + f << "# version " << version++ << '\n'; + std::ofstream{leap_seconds}.write(input.data(), input.size()); +} + +template +static void +test_leap_seconds(std::chrono::time_point time, std::chrono::seconds expected) { + auto utc = std::chrono::utc_clock::from_sys(time); + auto diff = utc.time_since_epoch() - time.time_since_epoch(); + TEST_REQUIRE( + diff == expected, + TEST_WRITE_CONCATENATED("\tTime: ", time, "\nExpected output ", expected, "\nActual output ", diff, '\n')); +} + +// Note at the time of writing all leap seconds are positive. This test uses +// fake data to test the behaviour of negative leap seconds. +int main(int, const char**) { + using namespace std::literals::chrono_literals; + + // Use small values for simplicity. The dates are seconds since 1.1.1970. + write( + R"( +1 10 +60 11 +120 12 +180 11 +240 12 +300 13 +360 12 +)"); + + std::chrono::sys_days epoch = {std::chrono::January / 1 / 1900}; + test_leap_seconds(epoch, 0s); + + test_leap_seconds(epoch + 60s - 1ns, 0s); + test_leap_seconds(epoch + 60s, 1s); + test_leap_seconds(epoch + 60s + 1ns, 1s); + + test_leap_seconds(epoch + 120s - 1ns, 1s); + test_leap_seconds(epoch + 120s, 2s); + test_leap_seconds(epoch + 120s + 1ns, 2s); + + test_leap_seconds(epoch + 180s - 1ns, 2s); + test_leap_seconds(epoch + 180s, 1s); + test_leap_seconds(epoch + 180s + 1ns, 1s); + + test_leap_seconds(epoch + 240s - 1ns, 1s); + test_leap_seconds(epoch + 240s, 2s); + test_leap_seconds(epoch + 240s + 1ns, 2s); + + test_leap_seconds(epoch + 300s - 1ns, 2s); + test_leap_seconds(epoch + 300s, 3s); + test_leap_seconds(epoch + 300s + 1ns, 3s); + + test_leap_seconds(epoch + 360s - 1ns, 3s); + test_leap_seconds(epoch + 360s, 2s); + test_leap_seconds(epoch + 360s + 1ns, 2s); + + return 0; +} diff --git a/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp new file mode 100644 index 0000000000000..ab4dff46d9184 --- /dev/null +++ b/libcxx/test/libcxx/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// static sys_time> +// to_sys(const utc_time<_Duration>& __time); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" +#include "filesystem_test_helper.h" +#include "test_tzdb.h" + +scoped_test_env env; +[[maybe_unused]] const std::filesystem::path dir = env.create_dir("zoneinfo"); +const std::filesystem::path tzdata = env.create_file("zoneinfo/tzdata.zi"); +const std::filesystem::path leap_seconds = env.create_file("zoneinfo/leap-seconds.list"); + +std::string_view std::chrono::__libcpp_tzdb_directory() { + static std::string result = dir.string(); + return result; +} + +static void write(std::string_view input) { + static int version = 0; + + std::ofstream f{tzdata}; + f << "# version " << version++ << '\n'; + std::ofstream{leap_seconds}.write(input.data(), input.size()); +} + +template +static void test_leap_seconds(std::chrono::utc_time time, std::chrono::sys_time expected) { + auto result = std::chrono::utc_clock::to_sys(time); + TEST_REQUIRE(result == expected, + TEST_WRITE_CONCATENATED("\nExpected output ", expected, "\nActual output ", result, '\n')); +} + +// Note at the time of writing all leap seconds are positive. This test uses +// fake data to test the behaviour of negative leap seconds. +int main(int, const char**) { + using namespace std::literals::chrono_literals; + + // Use small values for simplicity. The dates are seconds since 1.1.1970. + write( + R"( +1 10 +60 11 +120 12 +180 11 +240 12 +300 13 +360 12 +)"); + + std::chrono::sys_seconds sys_epoch{std::chrono::sys_days{std::chrono::January / 1 / 1900}}; + std::chrono::utc_seconds utc_epoch{sys_epoch.time_since_epoch()}; + + test_leap_seconds(utc_epoch, sys_epoch); + auto test_transition = [](std::chrono::sys_seconds sys, std::chrono::seconds elapsed, bool positive) { + std::chrono::utc_seconds utc = std::chrono::utc_seconds{sys.time_since_epoch()} + elapsed; + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition no adjustment needed + // - at the start of the transition sys is clamped at the time just prior to the moment + // of the leap second insertion. The exact value depends + // on the resolution of the result type. 
+ // - 1ns before the end of the transition sys is still clamped like before + // - at the end of the transition sys is 1s behind the utc time + // - 1ns after the end of the transition sys is still 1s behind the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys - 1s); + test_leap_seconds(utc + 0ns, sys - 1ns); + test_leap_seconds(utc + 1s - 1ns, sys - 1ns); + test_leap_seconds(utc + 1s, sys); + test_leap_seconds(utc + 1s + 0ns, sys + 0ns); + test_leap_seconds(utc + 1s + 1ns, sys + 1ns); + } else { + // Every transition has the following tests + // - 1ns before the transition no adjustment needed + // - at the transition sys is 1s ahead of the utc time + // - 1ns after the transition sys is still 1s ahead of the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys + 1s); + test_leap_seconds(utc + 1ns, sys + 1s + 1ns); + } + }; + + test_transition(sys_epoch + 60s, 0s, true); + test_transition(sys_epoch + 120s, 1s, true); + test_transition(sys_epoch + 180s, 2s, false); + test_transition(sys_epoch + 240s, 1s, true); + test_transition(sys_epoch + 300s, 2s, true); + test_transition(sys_epoch + 360s, 3s, false); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp new file mode 100644 index 0000000000000..9d06d479ad90c --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/get_leap_second_info.pass.cpp @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// leap_second_info get_leap_second_info(const utc_time& ut); + +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +template +static void test_leap_second_info( + std::chrono::time_point time, bool is_leap_second, std::chrono::seconds elapsed) { + std::chrono::leap_second_info result = std::chrono::get_leap_second_info(time); + TEST_REQUIRE( + result.is_leap_second == is_leap_second && result.elapsed == elapsed, + TEST_WRITE_CONCATENATED( + "\nExpected output [", + is_leap_second, + ", ", + elapsed, + "]\nActual output [", + result.is_leap_second, + ", ", + result.elapsed, + "]\n")); +} + +static std::chrono::utc_seconds get_utc_time(long long seconds_since_1900) { + // The file leap-seconds.list stores dates since 1 January 1900, 00:00:00, we want + // seconds since 1 January 1970. + constexpr auto offset = + std::chrono::sys_days{std::chrono::January / 1 / 1970} - std::chrono::sys_days{std::chrono::January / 1 / 1900}; + return std::chrono::utc_seconds{std::chrono::seconds{seconds_since_1900} - offset}; +} + +// Tests set of existing database entries at the time of writing. +int main(int, const char**) { + using namespace std::literals::chrono_literals; + + test_leap_second_info(std::chrono::utc_seconds::min(), false, 0s); + + // Epoch transition no transitions. 
+ test_leap_second_info(std::chrono::utc_seconds{-1s}, false, 0s); + test_leap_second_info(std::chrono::utc_seconds{0s}, false, 0s); + test_leap_second_info(std::chrono::utc_seconds{1s}, false, 0s); + + // Transitions from the start of UTC. + auto test_transition = [](std::chrono::utc_seconds time, std::chrono::seconds elapsed, bool positive) { + // Note at the time of writing all leap seconds are positive so the else + // branch is never executed. The private test for this function tests + // negative leap seconds and uses the else branch. + + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition is_leap_second -> false, elapsed -> elapsed + // - at the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns after the start of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - 1ns before the end of the transition is_leap_second -> true, elapsed -> elapsed + 1 + // - at the end of the transition is_leap_second -> false, elapsed -> elapsed + 1 + + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, true, elapsed + 1s); + test_leap_second_info(time + 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s - 1ns, true, elapsed + 1s); + test_leap_second_info(time + 1s, false, elapsed + 1s); + } else { + // Every transition has the following tests + // - 1ns before the transition is_leap_second -> false, elapsed -> elapsed + // - at the transition is_leap_second -> false elapsed -> elapsed - 1 + // - 1ns after the transition is_leap_second -> false, elapsed -> elapsed - 1 + test_leap_second_info(time - 1ns, false, elapsed); + test_leap_second_info(time, false, elapsed - 1s); + test_leap_second_info(time + 1ns, false, elapsed - 1s); + } + }; + + // The timestamps are from leap-seconds.list in the IANA database. + // Note the times stamps are timestamps without leap seconds so the number + // here are incremented by x "leap seconds". 
+ test_transition(get_utc_time(2287785600 + 0), 0s, true); // 1 Jul 1972 + test_transition(get_utc_time(2303683200 + 1), 1s, true); // 1 Jan 1973 + test_transition(get_utc_time(2335219200 + 2), 2s, true); // 1 Jan 1974 + test_transition(get_utc_time(2366755200 + 3), 3s, true); // 1 Jan 1975 + test_transition(get_utc_time(2398291200 + 4), 4s, true); // 1 Jan 1976 + test_transition(get_utc_time(2429913600 + 5), 5s, true); // 1 Jan 1977 + test_transition(get_utc_time(2461449600 + 6), 6s, true); // 1 Jan 1978 + test_transition(get_utc_time(2492985600 + 7), 7s, true); // 1 Jan 1979 + test_transition(get_utc_time(2524521600 + 8), 8s, true); // 1 Jan 1980 + test_transition(get_utc_time(2571782400 + 9), 9s, true); // 1 Jul 1981 + test_transition(get_utc_time(2603318400 + 10), 10s, true); // 1 Jul 1982 + test_transition(get_utc_time(2634854400 + 11), 11s, true); // 1 Jul 1983 + test_transition(get_utc_time(2698012800 + 12), 12s, true); // 1 Jul 1985 + test_transition(get_utc_time(2776982400 + 13), 13s, true); // 1 Jan 1988 + test_transition(get_utc_time(2840140800 + 14), 14s, true); // 1 Jan 1990 + test_transition(get_utc_time(2871676800 + 15), 15s, true); // 1 Jan 1991 + test_transition(get_utc_time(2918937600 + 16), 16s, true); // 1 Jul 1992 + test_transition(get_utc_time(2950473600 + 17), 17s, true); // 1 Jul 1993 + test_transition(get_utc_time(2982009600 + 18), 18s, true); // 1 Jul 1994 + test_transition(get_utc_time(3029443200 + 19), 19s, true); // 1 Jan 1996 + test_transition(get_utc_time(3076704000 + 20), 20s, true); // 1 Jul 1997 + test_transition(get_utc_time(3124137600 + 21), 21s, true); // 1 Jan 1999 + test_transition(get_utc_time(3345062400 + 22), 22s, true); // 1 Jan 2006 + test_transition(get_utc_time(3439756800 + 23), 23s, true); // 1 Jan 2009 + test_transition(get_utc_time(3550089600 + 24), 24s, true); // 1 Jul 2012 + test_transition(get_utc_time(3644697600 + 25), 25s, true); // 1 Jul 2015 + test_transition(get_utc_time(3692217600 + 26), 26s, true); // 1 Jan 2017 + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp new file mode 100644 index 0000000000000..90cf99d4b30c7 --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/leap_second_info.members.pass.cpp @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// + +// struct leap_second_info { +// bool is_leap_second; +// seconds elapsed; +// }; + +#include +#include + +// Validates whether: +// - The members are present as non-const members. +// - The struct is an aggregate. 
+int main(int, const char**) { + static_assert(std::is_aggregate_v); + + std::chrono::leap_second_info leap_second_info{.is_leap_second = false, .elapsed = std::chrono::seconds(0)}; + + [[maybe_unused]] bool& is_leap_second = leap_second_info.is_leap_second; + [[maybe_unused]] std::chrono::seconds& elapsed = leap_second_info.elapsed; + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp new file mode 100644 index 0000000000000..ab22cfafa2b0f --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/from_sys.pass.cpp @@ -0,0 +1,245 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// template +// static utc_time> +// from_sys(const sys_time& time); + +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +template +static void test_leap_seconds(std::chrono::time_point time, + std::chrono::seconds leap_seconds) { + auto utc = std::chrono::utc_clock::from_sys(time); + auto diff = utc.time_since_epoch() - time.time_since_epoch(); + TEST_REQUIRE( + diff == leap_seconds, + TEST_WRITE_CONCATENATED("\tTime: ", time, "\nExpected output ", leap_seconds, "\nActual output ", diff, '\n')); +} + +// This test is based on the example in [time.clock.utc.members]/3 +static void test_example_standard() { + using namespace std::literals::chrono_literals; + + auto t = std::chrono::sys_days{std::chrono::July / 1 / 2015} - 2ns; + test_leap_seconds(t, 25s); + + t += 1ns; + test_leap_seconds(t, 25s); + + t += 1ns; + test_leap_seconds(t, 26s); + + t += 1ns; + test_leap_seconds(t, 26s); +} + +// Tests set of existing database entries at the time of writing. +static void test_transitions() { + using namespace std::literals::chrono_literals; + + test_leap_seconds(std::chrono::sys_seconds::min(), 0s); + test_leap_seconds(std::chrono::sys_days::min(), 0s); + + // Epoch transition no transitions. + test_leap_seconds(std::chrono::sys_seconds{-1s}, 0s); + test_leap_seconds(std::chrono::sys_seconds{0s}, 0s); + test_leap_seconds(std::chrono::sys_seconds{1s}, 0s); + + // Transitions from the start of UTC. 
+ test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1972} - 1ns, 0s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1972}, 0s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1972} + 1ns, 0s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1972} - 1ns, 0s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1972}, 1s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1972} + 1ns, 1s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1973} - 1ns, 1s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1973}, 2s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1973} + 1ns, 2s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1974} - 1ns, 2s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1974}, 3s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1974} + 1ns, 3s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1975} - 1ns, 3s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1975}, 4s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1975} + 1ns, 4s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1976} - 1ns, 4s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1976}, 5s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1976} + 1ns, 5s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1977} - 1ns, 5s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1977}, 6s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1977} + 1ns, 6s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1978} - 1ns, 6s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1978}, 7s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1978} + 1ns, 7s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1979} - 1ns, 7s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1979}, 8s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1979} + 1ns, 8s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1980} - 1ns, 8s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1980}, 9s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1980} + 1ns, 9s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1981} - 1ns, 9s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1981}, 10s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1981} + 1ns, 10s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1982} - 1ns, 10s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1982}, 11s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1982} + 1ns, 11s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1983} - 1ns, 11s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1983}, 12s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1983} + 1ns, 12s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1985} - 1ns, 12s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1985}, 13s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1985} + 
1ns, 13s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1988} - 1ns, 13s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1988}, 14s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1988} + 1ns, 14s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1990} - 1ns, 14s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1990}, 15s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1990} + 1ns, 15s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1991} - 1ns, 15s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1991}, 16s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1991} + 1ns, 16s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1992} - 1ns, 16s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1992}, 17s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1992} + 1ns, 17s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1993} - 1ns, 17s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1993}, 18s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1993} + 1ns, 18s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1994} - 1ns, 18s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1994}, 19s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1994} + 1ns, 19s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1996} - 1ns, 19s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1996}, 20s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1996} + 1ns, 20s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1997} - 1ns, 20s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1997}, 21s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 1997} + 1ns, 21s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1999} - 1ns, 21s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1999}, 22s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 1999} + 1ns, 22s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2006} - 1ns, 22s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2006}, 23s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2006} + 1ns, 23s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2009} - 1ns, 23s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2009}, 24s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2009} + 1ns, 24s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2012} - 1ns, 24s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2012}, 25s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2012} + 1ns, 25s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2015} - 1ns, 25s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2015}, 26s); + test_leap_seconds(std::chrono::sys_days{std::chrono::July / 1 / 2015} + 1ns, 26s); + + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2017} - 1ns, 26s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2017}, 27s); + 
test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2017} + 1ns, 27s); + + // This validates status when the tests were written. + // It's not possible to test the future; there might be additional leap + // seconds in the future. + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2024} - 1ns, 27s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2024}, 27s); + test_leap_seconds(std::chrono::sys_days{std::chrono::January / 1 / 2024} + 1ns, 27s); +} + +// Tests whether the return type is the expected type. +static void test_return_type() { + namespace cr = std::chrono; + using namespace std::literals::chrono_literals; + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{0ns}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{0us}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{0ms}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::seconds{0}}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::minutes{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::hours{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::days{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::weeks{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::months{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::from_sys(cr::sys_time{cr::years{0}}); + } +} + +int main(int, const char**) { + test_example_standard(); + test_transitions(); + test_return_type(); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp new file mode 100644 index 0000000000000..2b6967b1c983a --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/now.pass.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// +// +// class utc_clock; + +// static time_point now(); + +#include +#include +#include + +int main(int, const char**) { + using clock = std::chrono::utc_clock; + std::same_as decltype(auto) t = clock::now(); + + assert(t >= clock::time_point::min()); + assert(t <= clock::time_point::max()); + + auto t2 = clock::now(); + assert(t2 - t >= std::chrono::seconds(0)); + // This may fail if the tests takes a long time to complete. 
+ assert(t2 - t < std::chrono::seconds(42)); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp new file mode 100644 index 0000000000000..9b43ca4c0dde0 --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/time.clock.utc.members/to_sys.pass.cpp @@ -0,0 +1,252 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// + +// class utc_clock; + +// static sys_time> +// to_sys(const utc_time<_Duration>& __time); + +#include +#include +#include + +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +template +static void test_leap_seconds(std::chrono::utc_time time, std::chrono::sys_time expected) { + auto result = std::chrono::utc_clock::to_sys(time); + TEST_REQUIRE( + result == expected, + TEST_WRITE_CONCATENATED("\tTime: ", time, "\nExpected output ", expected, "\nActual output ", result, '\n')); +} + +static std::chrono::sys_seconds get_sys_time(long long seconds_since_1900) { + // The file leap-seconds.list stores dates since 1 January 1900, 00:00:00, we want + // seconds since 1 January 1970. + constexpr auto offset = + std::chrono::sys_days{std::chrono::January / 1 / 1970} - std::chrono::sys_days{std::chrono::January / 1 / 1900}; + return std::chrono::sys_seconds{std::chrono::seconds{seconds_since_1900} - offset}; +} + +// Tests the set of existing database entries at the time of writing. Since +// the last leap second insertion is several years ago, it's expected all +// systems have the same information. (Adding new entries in the future does +// not affect this test.) +static void test_transitions() { + using namespace std::literals::chrono_literals; + + test_leap_seconds(std::chrono::utc_seconds::min(), std::chrono::sys_seconds::min()); + + // Epoch transition no transitions. + test_leap_seconds(std::chrono::utc_seconds{-1s}, std::chrono::sys_seconds{-1s}); + test_leap_seconds(std::chrono::utc_seconds{0s}, std::chrono::sys_seconds{0s}); + test_leap_seconds(std::chrono::utc_seconds{1s}, std::chrono::sys_seconds{1s}); + + // "sys" is the time of the transition to the next leap second. + // "elapsed" is the number of leap seconds before the transition. + // "positive" is the leap second added +1s? If not it's -1s. + auto test_transition = [](std::chrono::sys_seconds sys, std::chrono::seconds elapsed, bool positive) { + // Note at the time of writing all leap seconds are positive so the else + // branch is never executed. The private test for this function tests + // negative leap seconds and uses the else branch. + + std::chrono::utc_seconds utc = std::chrono::utc_seconds{sys.time_since_epoch()} + elapsed; + if (positive) { + // Every transition has the following tests + // - 1ns before the start of the transition no adjustment needed + // - at the start of the transition sys is clamped at the time just prior to the moment + // of the leap second insertion. 
The exact value depends + // on the resolution of the result type. + // - 1ns before the end of the transition sys is still clamped like before + // - at the end of the transition sys is 1s behind the utc time + // - 1ns after the end of the transition sys is still 1s behind the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys - 1s); + test_leap_seconds(utc + 0ns, sys - 1ns); + test_leap_seconds(utc + 1s - 1ns, sys - 1ns); + test_leap_seconds(utc + 1s, sys); + test_leap_seconds(utc + 1s + 0ns, sys + 0ns); + test_leap_seconds(utc + 1s + 1ns, sys + 1ns); + } else { + // Every transition has the following tests + // - 1ns before the transition no adjustment needed + // - at the transition sys is 1s ahead of the utc time + // - 1ns after the transition sys is still 1s ahead of the utc time + test_leap_seconds(utc - 1ns, sys - 1ns); + test_leap_seconds(utc, sys + 1s); + test_leap_seconds(utc + 1ns, sys + 1s + 1ns); + } + }; + + // Transitions from the start of UTC. + test_transition(get_sys_time(2287785600), 0s, true); // 1 Jul 1972 + test_transition(get_sys_time(2303683200), 1s, true); // 1 Jan 1973 + test_transition(get_sys_time(2335219200), 2s, true); // 1 Jan 1974 + test_transition(get_sys_time(2366755200), 3s, true); // 1 Jan 1975 + test_transition(get_sys_time(2398291200), 4s, true); // 1 Jan 1976 + test_transition(get_sys_time(2429913600), 5s, true); // 1 Jan 1977 + test_transition(get_sys_time(2461449600), 6s, true); // 1 Jan 1978 + test_transition(get_sys_time(2492985600), 7s, true); // 1 Jan 1979 + test_transition(get_sys_time(2524521600), 8s, true); // 1 Jan 1980 + test_transition(get_sys_time(2571782400), 9s, true); // 1 Jul 1981 + test_transition(get_sys_time(2603318400), 10s, true); // 1 Jul 1982 + test_transition(get_sys_time(2634854400), 11s, true); // 1 Jul 1983 + test_transition(get_sys_time(2698012800), 12s, true); // 1 Jul 1985 + test_transition(get_sys_time(2776982400), 13s, true); // 1 Jan 1988 + test_transition(get_sys_time(2840140800), 14s, true); // 1 Jan 1990 + test_transition(get_sys_time(2871676800), 15s, true); // 1 Jan 1991 + test_transition(get_sys_time(2918937600), 16s, true); // 1 Jul 1992 + test_transition(get_sys_time(2950473600), 17s, true); // 1 Jul 1993 + test_transition(get_sys_time(2982009600), 18s, true); // 1 Jul 1994 + test_transition(get_sys_time(3029443200), 19s, true); // 1 Jan 1996 + test_transition(get_sys_time(3076704000), 20s, true); // 1 Jul 1997 + test_transition(get_sys_time(3124137600), 21s, true); // 1 Jan 1999 + test_transition(get_sys_time(3345062400), 22s, true); // 1 Jan 2006 + test_transition(get_sys_time(3439756800), 23s, true); // 1 Jan 2009 + test_transition(get_sys_time(3550089600), 24s, true); // 1 Jul 2012 + test_transition(get_sys_time(3644697600), 25s, true); // 1 Jul 2015 + test_transition(get_sys_time(3692217600), 26s, true); // 1 Jan 2017 +} + +// Tests the transition for clocks where the duration's rep is a floating-point type. +static void test_transitions_floating_point() { + using namespace std::literals::chrono_literals; + + // Based on test_transitions but uses a floating-point duration. + using F = float; + + auto test_transition = [](std::chrono::sys_seconds sys, std::chrono::seconds elapsed, bool positive) { + // Note at the time of writing all leap seconds are positive so the else + // branch is never executed. The private test for this function tests + // negative leap seconds and uses the else branch. 
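+ //
+ // Besides mirroring the integral checks, the body below also probes the
+ // representable float values directly adjacent to each boundary with
+ // std::nextafter, to catch off-by-one-ULP errors in the conversion.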
+ + std::chrono::utc_seconds utc = std::chrono::utc_seconds{sys.time_since_epoch()} + elapsed; + + using D = std::chrono::duration; + using S = std::chrono ::time_point; + using U = std::chrono ::time_point; + + S s{sys.time_since_epoch()}; + bool is_leap_second = s.time_since_epoch().count() == sys.time_since_epoch().count(); + assert(is_leap_second); + + U u{utc.time_since_epoch()}; + if (positive) { + test_leap_seconds(u - 1ns, s - 1ns); + test_leap_seconds(u, s - 1s); + test_leap_seconds(u + 0ns, s - 1ns); + test_leap_seconds(u + 1s - 1ns, s - 1ns); + test_leap_seconds(u + 1s, s); + test_leap_seconds(u + 1s + 0ns, s + 0ns); + test_leap_seconds(u + 1s + 1ns, s + 1ns); + + test_leap_seconds(U{D{std::nextafter(u.time_since_epoch().count(), F{0})}}, + S{D{std::nextafter(s.time_since_epoch().count(), F{0})}}); + test_leap_seconds(u, S{D{s.time_since_epoch().count() - F{1}}}); + test_leap_seconds(U{D{u.time_since_epoch().count() + F{1}}}, s); + test_leap_seconds(U{D{std::nextafter(u.time_since_epoch().count() + F{1}, std::numeric_limits::max())}}, + S{D{std::nextafter(s.time_since_epoch().count(), std::numeric_limits::max())}}); + } + }; + + // Transitions from the start of UTC. + test_transition(get_sys_time(2287785600), 0s, true); // 1 Jul 1972 + test_transition(get_sys_time(2303683200), 1s, true); // 1 Jan 1973 + test_transition(get_sys_time(2335219200), 2s, true); // 1 Jan 1974 + test_transition(get_sys_time(2366755200), 3s, true); // 1 Jan 1975 + test_transition(get_sys_time(2398291200), 4s, true); // 1 Jan 1976 + test_transition(get_sys_time(2429913600), 5s, true); // 1 Jan 1977 + test_transition(get_sys_time(2461449600), 6s, true); // 1 Jan 1978 + test_transition(get_sys_time(2492985600), 7s, true); // 1 Jan 1979 + test_transition(get_sys_time(2524521600), 8s, true); // 1 Jan 1980 + test_transition(get_sys_time(2571782400), 9s, true); // 1 Jul 1981 + test_transition(get_sys_time(2603318400), 10s, true); // 1 Jul 1982 + test_transition(get_sys_time(2634854400), 11s, true); // 1 Jul 1983 + test_transition(get_sys_time(2698012800), 12s, true); // 1 Jul 1985 + test_transition(get_sys_time(2776982400), 13s, true); // 1 Jan 1988 + test_transition(get_sys_time(2840140800), 14s, true); // 1 Jan 1990 + test_transition(get_sys_time(2871676800), 15s, true); // 1 Jan 1991 + test_transition(get_sys_time(2918937600), 16s, true); // 1 Jul 1992 + test_transition(get_sys_time(2950473600), 17s, true); // 1 Jul 1993 + test_transition(get_sys_time(2982009600), 18s, true); // 1 Jul 1994 + test_transition(get_sys_time(3029443200), 19s, true); // 1 Jan 1996 + test_transition(get_sys_time(3076704000), 20s, true); // 1 Jul 1997 + test_transition(get_sys_time(3124137600), 21s, true); // 1 Jan 1999 + test_transition(get_sys_time(3345062400), 22s, true); // 1 Jan 2006 + test_transition(get_sys_time(3439756800), 23s, true); // 1 Jan 2009 + test_transition(get_sys_time(3550089600), 24s, true); // 1 Jul 2012 + test_transition(get_sys_time(3644697600), 25s, true); // 1 Jul 2015 + test_transition(get_sys_time(3692217600), 26s, true); // 1 Jan 2017 +} + +// Tests whether the return type is the expected type. 
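+// (to_sys is specified to return sys_time<common_type_t<Duration, seconds>>,
+// so inputs coarser than seconds, such as minutes, hours, or years, are
+// expected to come back as a seconds-based sys_time.)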
+static void test_return_type() { + namespace cr = std::chrono; + using namespace std::literals::chrono_literals; + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{0ns}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{0us}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{0ms}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::seconds{0}}); + } + + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::minutes{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::hours{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::days{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::weeks{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::months{0}}); + } + { + [[maybe_unused]] std::same_as> decltype(auto) _ = + cr::utc_clock::to_sys(cr::utc_time{cr::years{0}}); + } +} + +int main(int, const char**) { + test_transitions(); + test_transitions_floating_point(); + test_return_type(); + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp new file mode 100644 index 0000000000000..0322e9122e1cd --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/types.compile.pass.cpp @@ -0,0 +1,60 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// + +// class utc_clock { +// public: +// using rep = a signed arithmetic type; +// using period = ratio; +// using duration = chrono::duration; +// using time_point = chrono::time_point; +// static constexpr bool is_steady = unspecified; +// +// ... +// }; +// +// template +// using utc_time = time_point; +// using utc_seconds = utc_time; + +#include +#include +#include + +#include "test_macros.h" + +// class utc_clock +using rep = std::chrono::utc_clock::rep; +using period = std::chrono::utc_clock::period; +using duration = std::chrono::utc_clock::duration; +using time_point = std::chrono::utc_clock::time_point; +constexpr bool is_steady = std::chrono::utc_clock::is_steady; + +// Tests the values. Some of them are implementation-defined. 
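+// (The LIBCPP_STATIC_ASSERT lines below pin down libc++'s own
+// implementation-defined choices; the plain static_asserts check what the
+// standard itself mandates.)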
+LIBCPP_STATIC_ASSERT(std::same_as); +static_assert(std::is_arithmetic_v); +static_assert(std::is_signed_v); + +LIBCPP_STATIC_ASSERT(std::same_as); +static_assert(std::same_as>); + +static_assert(std::same_as>); +static_assert(std::same_as>); +LIBCPP_STATIC_ASSERT(is_steady == false); + +// typedefs +static_assert(std::same_as, std::chrono::time_point>); +static_assert(std::same_as, std::chrono::time_point>); +static_assert(std::same_as>); diff --git a/libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp new file mode 100644 index 0000000000000..8fd3b8a3e1d47 --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.utc/utc_time.ostream.pass.cpp @@ -0,0 +1,165 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// REQUIRES: locale.fr_FR.UTF-8 +// REQUIRES: locale.ja_JP.UTF-8 + +// + +// using utc_time = ...; + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const utc_time& tp); + +#include +#include +#include +#include + +#include "make_string.h" +#include "platform_support.h" // locale name macros +#include "test_macros.h" + +#define SV(S) MAKE_STRING_VIEW(CharT, S) + +template +static std::basic_string stream_c_locale(std::chrono::utc_time time_point) { + std::basic_stringstream sstr; + sstr << std::fixed << time_point; + return sstr.str(); +} + +template +static std::basic_string stream_fr_FR_locale(std::chrono::utc_time time_point) { + std::basic_stringstream sstr; + const std::locale locale(LOCALE_fr_FR_UTF_8); + sstr.imbue(locale); + sstr << std::fixed << time_point; + return sstr.str(); +} + +template +static std::basic_string stream_ja_JP_locale(std::chrono::utc_time time_point) { + std::basic_stringstream sstr; + const std::locale locale(LOCALE_ja_JP_UTF_8); + sstr.imbue(locale); + sstr << std::fixed << time_point; + return sstr.str(); +} + +template +static void test_c() { + using namespace std::literals::chrono_literals; + + assert(stream_c_locale(std::chrono::utc_time{946'688'523'123'456'789ns}) == + SV("2000-01-01 01:01:41.123456789")); + assert(stream_c_locale(std::chrono::utc_time{946'688'523'123'456us}) == + SV("2000-01-01 01:01:41.123456")); + + assert(stream_c_locale(std::chrono::utc_time{946'684'822'123ms}) == + SV("2000-01-01 00:00:00.123")); + assert(stream_c_locale(std::chrono::utc_seconds{1'234'567'890s}) == SV("2009-02-13 23:31:06")); + assert(stream_c_locale(std::chrono::utc_time{20'576'131min}) == + SV("2009-02-13 23:30:36")); + assert(stream_c_locale(std::chrono::utc_time{342'935h}) == SV("2009-02-13 22:59:36")); + + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:30:00.0")); + 
assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:15:00.00")); + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); + assert(stream_c_locale(std::chrono::utc_time>>{ + std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:06.10")); +} + +template +static void test_fr_FR() { + using namespace std::literals::chrono_literals; + + assert(stream_fr_FR_locale(std::chrono::utc_time{946'688'523'123'456'789ns}) == + SV("2000-01-01 01:01:41,123456789")); + assert(stream_fr_FR_locale(std::chrono::utc_time{946'688'523'123'456us}) == + SV("2000-01-01 01:01:41,123456")); + + assert(stream_fr_FR_locale(std::chrono::utc_time{946'684'822'123ms}) == + SV("2000-01-01 00:00:00,123")); + assert(stream_fr_FR_locale(std::chrono::utc_seconds{1'234'567'890s}) == SV("2009-02-13 23:31:06")); + assert(stream_fr_FR_locale(std::chrono::utc_time{20'576'131min}) == + SV("2009-02-13 23:30:36")); + assert(stream_fr_FR_locale(std::chrono::utc_time{342'935h}) == SV("2009-02-13 22:59:36")); + + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:30:00,0")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:15:00,00")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01,1")); + assert(stream_fr_FR_locale(std::chrono::utc_time>>{ + std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:06,10")); +} + +template +static void test_ja_JP() { + using namespace std::literals::chrono_literals; + + assert(stream_ja_JP_locale(std::chrono::utc_time{946'688'523'123'456'789ns}) == + SV("2000-01-01 01:01:41.123456789")); + assert(stream_ja_JP_locale(std::chrono::utc_time{946'688'523'123'456us}) == + SV("2000-01-01 01:01:41.123456")); + + assert(stream_ja_JP_locale(std::chrono::utc_time{946'684'822'123ms}) == + SV("2000-01-01 00:00:00.123")); + assert(stream_ja_JP_locale(std::chrono::utc_seconds{1'234'567'890s}) == SV("2009-02-13 23:31:06")); + assert(stream_ja_JP_locale(std::chrono::utc_time{20'576'131min}) == + SV("2009-02-13 23:30:36")); + assert(stream_ja_JP_locale(std::chrono::utc_time{342'935h}) == SV("2009-02-13 22:59:36")); + + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:30:00.0")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{3600}}) == SV("1970-01-01 00:15:00.00")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); + assert(stream_ja_JP_locale(std::chrono::utc_time>>{ + std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:06.10")); +} + +template +static void test() { + test_c(); + test_fr_FR(); + test_ja_JP(); +} + +int main(int, char**) { + test(); + +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp b/libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp new file mode 100644 index 0000000000000..e6f94bf7fecc6 --- /dev/null +++ b/libcxx/test/std/time/time.syn/formatter.utc_time.pass.cpp @@ -0,0 +1,1004 @@ 
+//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-filesystem, no-localization, no-tzdb +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// XFAIL: libcpp-has-no-experimental-tzdb +// XFAIL: availability-tzdb-missing + +// REQUIRES: locale.fr_FR.UTF-8 +// REQUIRES: locale.ja_JP.UTF-8 + +// + +// template +// struct formatter, charT>; + +#include +#include + +#include +#include +#include +#include +#include + +#include "formatter_tests.h" +#include "make_string.h" +#include "platform_support.h" // locale name macros +#include "test_macros.h" + +template +static void test_no_chrono_specs() { + using namespace std::literals::chrono_literals; + + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output + + // [time.syn] + // using nanoseconds = duration; + // using microseconds = duration; + // using milliseconds = duration; + // using seconds = duration; + // using minutes = duration>; + // using hours = duration>; + check(SV("1425-08-04 22:06:56"), SV("{}"), std::chrono::utc_seconds(-17'179'869'184s)); // Minimum value for 35 bits. + check(SV("1901-12-13 20:45:52"), SV("{}"), std::chrono::utc_seconds(-2'147'483'648s)); + + check(SV("1969-12-31 00:00:00"), SV("{}"), std::chrono::utc_seconds(-24h)); + check(SV("1969-12-31 06:00:00"), SV("{}"), std::chrono::utc_seconds(-18h)); + check(SV("1969-12-31 12:00:00"), SV("{}"), std::chrono::utc_seconds(-12h)); + check(SV("1969-12-31 18:00:00"), SV("{}"), std::chrono::utc_seconds(-6h)); + check(SV("1969-12-31 23:59:59"), SV("{}"), std::chrono::utc_seconds(-1s)); + + check(SV("1970-01-01 00:00:00"), SV("{}"), std::chrono::utc_seconds(0s)); + check(SV("2000-01-01 00:00:00"), SV("{}"), std::chrono::utc_seconds(946'684'800s + 22s)); + check(SV("2000-01-01 01:02:03"), SV("{}"), std::chrono::utc_seconds(946'688'523s + 22s)); + + check(SV("2038-01-19 03:14:07"), SV("{}"), std::chrono::utc_seconds(2'147'483'647s + 27s)); + check(SV("2514-05-30 01:53:03"), + SV("{}"), + std::chrono::utc_seconds(17'179'869'183s + 27s)); // Maximum value for 35 bits. 
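+ // The '+ 22s' and '+ 27s' terms above add the leap seconds that had
+ // accumulated by the given date (22 by 2000, 27 from 2017 onwards), so a
+ // round sys-based count still formats as a round calendar time.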
+ + check(SV("2000-01-01 01:02:03.123"), + SV("{}"), + std::chrono::utc_time(946'688'523'123ms + 22s)); + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_year() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = + SV("{:%%C='%C'%t%%EC='%EC'%t%%y='%y'%t%%Oy='%Oy'%t%%Ey='%Ey'%t%%Y='%Y'%t%%EY='%EY'%n}"); + constexpr std::basic_string_view lfmt = + SV("{:L%%C='%C'%t%%EC='%EC'%t%%y='%y'%t%%Oy='%Oy'%t%%Ey='%Ey'%t%%Y='%Y'%t%%EY='%EY'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check(SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. +#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%C='19'\t%EC='19'\t%y='70'\t%Oy='70'\t%Ey='70'\t%Y='1970'\t%EY='1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%C='20'\t%EC='20'\t%y='09'\t%Oy='09'\t%Ey='09'\t%Y='2009'\t%EY='2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX)||defined(__FreeBSD__) + check(loc, + SV("%C='19'\t%EC='昭和'\t%y='70'\t%Oy='七十'\t%Ey='45'\t%Y='1970'\t%EY='昭和45年'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%C='20'\t%EC='平成'\t%y='09'\t%Oy='九'\t%Ey='21'\t%Y='2009'\t%EY='平成21年'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX)||defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_month() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%b='%b'%t%%h='%h'%t%%B='%B'%t%%m='%m'%t%%Om='%Om'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%b='%b'%t%%h='%h'%t%%B='%B'%t%%m='%m'%t%%Om='%Om'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%b='Jan'\t%h='Jan'\t%B='January'\t%m='01'\t%Om='01'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%b='May'\t%h='May'\t%B='May'\t%m='05'\t%Om='05'\n"), + fmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use the global locale (fr_FR) +#if defined(__APPLE__) + check(SV("%b='jan'\t%h='jan'\t%B='janvier'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 +#else + 
check(SV("%b='janv.'\t%h='janv.'\t%B='janvier'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 +#endif + + check(SV("%b='mai'\t%h='mai'\t%B='mai'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use supplied locale (ja_JP). This locale has a different alternate. +#ifdef _WIN32 + check(loc, + SV("%b='1'\t%h='1'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b='5'\t%h='5'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#elif defined(_AIX) // _WIN32 + check(loc, + SV("%b='1月'\t%h='1月'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b='5月'\t%h='5月'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#elif defined(__APPLE__) // _WIN32 + check(loc, + SV("%b=' 1'\t%h=' 1'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b=' 5'\t%h=' 5'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#elif defined(__FreeBSD__) // _WIN32 + check(loc, + SV("%b=' 1月'\t%h=' 1月'\t%B='1月'\t%m='01'\t%Om='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b=' 5月'\t%h=' 5月'\t%B='5月'\t%m='05'\t%Om='05'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#else // _WIN32 + check(loc, + SV("%b=' 1月'\t%h=' 1月'\t%B='1月'\t%m='01'\t%Om='一'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%b=' 5月'\t%h=' 5月'\t%B='5月'\t%m='05'\t%Om='五'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#endif // _WIN32 + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_day() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%d='%d'%t%%Od='%Od'%t%%e='%e'%t%%Oe='%Oe'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%d='%d'%t%%Od='%Od'%t%%e='%e'%t%%Oe='%Oe'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check(SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. 
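+ // (Only the glibc branch below has true alternate numerals: %Od and %Oe
+ // render 一 and 十三; the other platforms repeat the Arabic digits.)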
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%d='01'\t%Od='01'\t%e=' 1'\t%Oe=' 1'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%d='13'\t%Od='13'\t%e='13'\t%Oe='13'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%d='01'\t%Od='一'\t%e=' 1'\t%Oe='一'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%d='13'\t%Od='十三'\t%e='13'\t%Oe='十三'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_weekday() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = + SV("{:%%a='%a'%t%%A='%A'%t%%u='%u'%t%%Ou='%Ou'%t%%w='%w'%t%%Ow='%Ow'%n}"); + constexpr std::basic_string_view lfmt = + SV("{:L%%a='%a'%t%%A='%A'%t%%u='%u'%t%%Ou='%Ou'%t%%w='%w'%t%%Ow='%Ow'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%a='Thu'\t%A='Thursday'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%a='Sun'\t%A='Sunday'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + fmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 + + // Use the global locale (fr_FR) +#if defined(__APPLE__) + check(SV("%a='Jeu'\t%A='Jeudi'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%a='Dim'\t%A='Dimanche'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#else + check(SV("%a='jeu.'\t%A='jeudi'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%a='dim.'\t%A='dimanche'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#endif + + // Use supplied locale (ja_JP). 
+ // This locale has a different alternate, but not on all platforms +#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%a='木'\t%A='木曜日'\t%u='4'\t%Ou='4'\t%w='4'\t%Ow='4'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%a='日'\t%A='日曜日'\t%u='7'\t%Ou='7'\t%w='0'\t%Ow='0'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%a='木'\t%A='木曜日'\t%u='4'\t%Ou='四'\t%w='4'\t%Ow='四'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%a='日'\t%A='日曜日'\t%u='7'\t%Ou='七'\t%w='0'\t%Ow='〇'\n"), + lfmt, + std::chrono::utc_seconds(4'294'967'295s)); // 06:28:15 UTC on Sunday, 7 February 2106 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_day_of_year() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%j='%j'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%j='%j'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%j='001'\n"), fmt, std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + check(SV("%j='138'\n"), fmt, std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use the global locale (fr_FR) + check(SV("%j='001'\n"), lfmt, std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + check(SV("%j='138'\n"), lfmt, std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use supplied locale (ja_JP). This locale has a different alternate. + check(loc, SV("%j='001'\n"), lfmt, std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check( + loc, SV("%j='138'\n"), lfmt, std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_week() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%U='%U'%t%%OU='%OU'%t%%W='%W'%t%%OW='%OW'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%U='%U'%t%%OU='%OU'%t%%W='%W'%t%%OW='%OW'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"), + fmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use the global locale (fr_FR) + check(SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 + + // Use supplied locale (ja_JP). This locale has a different alternate. 
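+ // (%U numbers weeks starting from the first Sunday of the year and %W from
+ // the first Monday; 1 January 1970 was a Thursday, so both start at '00'.
+ // Again only glibc supplies the alternate numerals 〇 and 二十.)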
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%U='00'\t%OU='00'\t%W='00'\t%OW='00'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%U='20'\t%OU='20'\t%W='20'\t%OW='20'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%U='00'\t%OU='〇'\t%W='00'\t%OW='〇'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%U='20'\t%OU='二十'\t%W='20'\t%OW='二十'\n"), + lfmt, + std::chrono::utc_seconds(2'000'000'000s)); // 03:33:20 UTC on Wednesday, 18 May 2033 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_iso_8601_week() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%g='%g'%t%%G='%G'%t%%V='%V'%t%%OV='%OV'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%g='%g'%t%%G='%G'%t%%V='%V'%t%%OV='%OV'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check(SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. 
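+ // (%V is the ISO 8601 week number, counted from the week containing the
+ // year's first Thursday, and %G/%g give the matching week-based year; since
+ // 1 January 1970 was itself a Thursday, it lands in week 01 of 1970.)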
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%g='70'\t%G='1970'\t%V='01'\t%OV='01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%g='09'\t%G='2009'\t%V='07'\t%OV='07'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%g='70'\t%G='1970'\t%V='01'\t%OV='一'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%g='09'\t%G='2009'\t%V='07'\t%OV='七'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_date() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%D='%D'%t%%F='%F'%t%%x='%x'%t%%Ex='%Ex'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%D='%D'%t%%F='%F'%t%%x='%x'%t%%Ex='%Ex'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01/01/70'\t%Ex='01/01/70'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='02/13/09'\t%Ex='02/13/09'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) +#if defined(__APPLE__) || defined(__FreeBSD__) + check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01.01.1970'\t%Ex='01.01.1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='13.02.2009'\t%Ex='13.02.2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else + check(SV("%D='01/01/70'\t%F='1970-01-01'\t%x='01/01/1970'\t%Ex='01/01/1970'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%D='02/13/09'\t%F='2009-02-13'\t%x='13/02/2009'\t%Ex='13/02/2009'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif + + // Use supplied locale (ja_JP). This locale has a different alternate. 
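+ // (On glibc %Ex uses the Japanese era calendar: 昭和45年 is Showa 45,
+ // i.e. 1970, and 平成21年 is Heisei 21, i.e. 2009.)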
+#if defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%D='01/01/70'\t%F='1970-01-01'\t%x='1970/01/01'\t%Ex='1970/01/01'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%D='02/13/09'\t%F='2009-02-13'\t%x='2009/02/13'\t%Ex='2009/02/13'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#else // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + check(loc, + SV("%D='01/01/70'\t%F='1970-01-01'\t%x='1970年01月01日'\t%Ex='昭和45年01月01日'\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%D='02/13/09'\t%F='2009-02-13'\t%x='2009年02月13日'\t%Ex='平成21年02月13日'\n"), + lfmt, + std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(_WIN32) || defined(__APPLE__) || defined(_AIX) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_time() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV( + "{:" + "%%H='%H'%t" + "%%OH='%OH'%t" + "%%I='%I'%t" + "%%OI='%OI'%t" + "%%M='%M'%t" + "%%OM='%OM'%t" + "%%S='%S'%t" + "%%OS='%OS'%t" + "%%p='%p'%t" + "%%R='%R'%t" + "%%T='%T'%t" + "%%r='%r'%t" + "%%X='%X'%t" + "%%EX='%EX'%t" + "%n}"); + constexpr std::basic_string_view lfmt = SV( + "{:L" + "%%H='%H'%t" + "%%OH='%OH'%t" + "%%I='%I'%t" + "%%OI='%OI'%t" + "%%M='%M'%t" + "%%OM='%OM'%t" + "%%S='%S'%t" + "%%OS='%OS'%t" + "%%p='%p'%t" + "%%R='%R'%t" + "%%T='%T'%t" + "%%r='%r'%t" + "%%X='%X'%t" + "%%EX='%EX'%t" + "%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%H='00'\t" + "%OH='00'\t" + "%I='12'\t" + "%OI='12'\t" + "%M='00'\t" + "%OM='00'\t" + "%S='00'\t" + "%OS='00'\t" + "%p='AM'\t" + "%R='00:00'\t" + "%T='00:00:00'\t" + "%r='12:00:00 AM'\t" + "%X='00:00:00'\t" + "%EX='00:00:00'\t" + "\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%H='23'\t" + "%OH='23'\t" + "%I='11'\t" + "%OI='11'\t" + "%M='31'\t" + "%OM='31'\t" + "%S='30.123'\t" + "%OS='30.123'\t" + "%p='PM'\t" + "%R='23:31'\t" + "%T='23:31:30.123'\t" + "%r='11:31:30 PM'\t" + "%X='23:31:30'\t" + "%EX='23:31:30'\t" + "\n"), + fmt, + std::chrono::utc_time( + 1'234'567'890'123ms + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 + // Use the global locale (fr_FR) + check(SV("%H='00'\t" + "%OH='00'\t" + "%I='12'\t" + "%OI='12'\t" + "%M='00'\t" + "%OM='00'\t" + "%S='00'\t" + "%OS='00'\t" +#if defined(_AIX) + "%p='AM'\t" +#else + "%p=''\t" +#endif + "%R='00:00'\t" + "%T='00:00:00'\t" +#ifdef _WIN32 + "%r='00:00:00'\t" +#elif defined(_AIX) + "%r='12:00:00 AM'\t" +#elif defined(__APPLE__) || defined(__FreeBSD__) + "%r=''\t" +#else + "%r='12:00:00 '\t" +#endif + "%X='00:00:00'\t" + "%EX='00:00:00'\t" + "\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%H='23'\t" + "%OH='23'\t" + "%I='11'\t" + "%OI='11'\t" + "%M='31'\t" + "%OM='31'\t" + "%S='30,123'\t" + "%OS='30,123'\t" +#if defined(_AIX) + "%p='PM'\t" +#else + "%p=''\t" +#endif + "%R='23:31'\t" + "%T='23:31:30,123'\t" +#ifdef _WIN32 + "%r='23:31:30'\t" +#elif defined(_AIX) + "%r='11:31:30 PM'\t" +#elif defined(__APPLE__) || defined(__FreeBSD__) + "%r=''\t" +#else + "%r='11:31:30 '\t" +#endif + 
"%X='23:31:30'\t" + "%EX='23:31:30'\t" + "\n"), + lfmt, + std::chrono::utc_time( + 1'234'567'890'123ms + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use supplied locale (ja_JP). This locale has a different alternate. +#if defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__) + check(loc, + SV("%H='00'\t" + "%OH='00'\t" + "%I='12'\t" + "%OI='12'\t" + "%M='00'\t" + "%OM='00'\t" + "%S='00'\t" + "%OS='00'\t" +# if defined(__APPLE__) + "%p='AM'\t" +# else + "%p='午前'\t" +# endif + "%R='00:00'\t" + "%T='00:00:00'\t" +# if defined(__APPLE__) || defined(__FreeBSD__) +# if defined(__APPLE__) + "%r='12:00:00 AM'\t" +# else + "%r='12:00:00 午前'\t" +# endif + "%X='00時00分00秒'\t" + "%EX='00時00分00秒'\t" +# elif defined(_WIN32) + "%r='0:00:00'\t" + "%X='0:00:00'\t" + "%EX='0:00:00'\t" +# else + "%r='午前12:00:00'\t" + "%X='00:00:00'\t" + "%EX='00:00:00'\t" +# endif + "\n"), + lfmt, + std::chrono::hh_mm_ss(0s)); + + check(loc, + SV("%H='23'\t" + "%OH='23'\t" + "%I='11'\t" + "%OI='11'\t" + "%M='31'\t" + "%OM='31'\t" + "%S='30.123'\t" + "%OS='30.123'\t" +# if defined(__APPLE__) + "%p='PM'\t" +# else + "%p='午後'\t" +# endif + "%R='23:31'\t" + "%T='23:31:30.123'\t" +# if defined(__APPLE__) || defined(__FreeBSD__) +# if defined(__APPLE__) + "%r='11:31:30 PM'\t" +# else + "%r='11:31:30 午後'\t" +# endif + "%X='23時31分30秒'\t" + "%EX='23時31分30秒'\t" +# elif defined(_WIN32) + "%r='23:31:30'\t" + "%X='23:31:30'\t" + "%EX='23:31:30'\t" +# else + "%r='午後11:31:30'\t" + "%X='23:31:30'\t" + "%EX='23:31:30'\t" +# endif + "\n"), + lfmt, + std::chrono::hh_mm_ss(23h + 31min + 30s + 123ms)); +#else // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__) + check(loc, + SV("%H='00'\t" + "%OH='〇'\t" + "%I='12'\t" + "%OI='十二'\t" + "%M='00'\t" + "%OM='〇'\t" + "%S='00'\t" + "%OS='〇'\t" + "%p='午前'\t" + "%R='00:00'\t" + "%T='00:00:00'\t" + "%r='午前12時00分00秒'\t" + "%X='00時00分00秒'\t" + "%EX='00時00分00秒'\t" + "\n"), + lfmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(loc, + SV("%H='23'\t" + "%OH='二十三'\t" + "%I='11'\t" + "%OI='十一'\t" + "%M='31'\t" + "%OM='三十一'\t" + "%S='30.123'\t" + "%OS='三十.123'\t" + "%p='午後'\t" + "%R='23:31'\t" + "%T='23:31:30.123'\t" + "%r='午後11時31分30秒'\t" + "%X='23時31分30秒'\t" + "%EX='23時31分30秒'\t" + "\n"), + lfmt, + std::chrono::utc_time( + 1'234'567'890'123ms + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 +#endif // defined(__APPLE__) || defined(_AIX) || defined(_WIN32) || defined(__FreeBSD__) + + std::locale::global(std::locale::classic()); +} + +template +static void test_valid_values_date_time() { + using namespace std::literals::chrono_literals; + + constexpr std::basic_string_view fmt = SV("{:%%c='%c'%t%%Ec='%Ec'%n}"); + constexpr std::basic_string_view lfmt = SV("{:L%%c='%c'%t%%Ec='%Ec'%n}"); + + const std::locale loc(LOCALE_ja_JP_UTF_8); + std::locale::global(std::locale(LOCALE_fr_FR_UTF_8)); + + // Non localized output using C-locale + check(SV("%c='Thu Jan 1 00:00:00 1970'\t%Ec='Thu Jan 1 00:00:00 1970'\n"), + fmt, + std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970 + + check(SV("%c='Fri Feb 13 23:31:30 2009'\t%Ec='Fri Feb 13 23:31:30 2009'\n"), + fmt, + std::chrono::utc_seconds(1'234'567'890s + 24s)); // 23:31:30 UTC on Friday, 13 February 2009 + + // Use the global locale (fr_FR) + check( +// https://sourceware.org/bugzilla/show_bug.cgi?id=24054 +#if defined(__powerpc__) && defined(__linux__) + SV("%c='jeu. 01 janv. 1970 00:00:00 UTC'\t%Ec='jeu. 01 janv. 
1970 00:00:00 UTC'\n"),
+#elif defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29
+ SV("%c='jeu. 01 janv. 1970 00:00:00 GMT'\t%Ec='jeu. 01 janv. 1970 00:00:00 GMT'\n"),
+#elif defined(_AIX)
+ SV("%c=' 1 janvier 1970 à 00:00:00 UTC'\t%Ec=' 1 janvier 1970 à 00:00:00 UTC'\n"),
+#elif defined(__APPLE__)
+ SV("%c='Jeu 1 jan 00:00:00 1970'\t%Ec='Jeu 1 jan 00:00:00 1970'\n"),
+#elif defined(_WIN32)
+ SV("%c='01/01/1970 00:00:00'\t%Ec='01/01/1970 00:00:00'\n"),
+#elif defined(__FreeBSD__)
+ SV("%c='jeu. 1 janv. 00:00:00 1970'\t%Ec='jeu. 1 janv. 00:00:00 1970'\n"),
+#else
+ SV("%c='jeu. 01 janv. 1970 00:00:00'\t%Ec='jeu. 01 janv. 1970 00:00:00'\n"),
+#endif
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ check(
+// https://sourceware.org/bugzilla/show_bug.cgi?id=24054
+#if defined(__powerpc__) && defined(__linux__)
+ SV("%c='ven. 13 févr. 2009 23:31:30 UTC'\t%Ec='ven. 13 févr. 2009 23:31:30 UTC'\n"),
+#elif defined(__GLIBC__) && __GLIBC__ <= 2 && __GLIBC_MINOR__ < 29
+ SV("%c='ven. 13 févr. 2009 23:31:30 GMT'\t%Ec='ven. 13 févr. 2009 23:31:30 GMT'\n"),
+#elif defined(_AIX)
+ SV("%c='13 février 2009 à 23:31:30 UTC'\t%Ec='13 février 2009 à 23:31:30 UTC'\n"),
+#elif defined(__APPLE__)
+ SV("%c='Ven 13 fév 23:31:30 2009'\t%Ec='Ven 13 fév 23:31:30 2009'\n"),
+#elif defined(_WIN32)
+ SV("%c='13/02/2009 23:31:30'\t%Ec='13/02/2009 23:31:30'\n"),
+#elif defined(__FreeBSD__)
+ SV("%c='ven. 13 févr. 23:31:30 2009'\t%Ec='ven. 13 févr. 23:31:30 2009'\n"),
+#else
+ SV("%c='ven. 13 févr. 2009 23:31:30'\t%Ec='ven. 13 févr. 2009 23:31:30'\n"),
+#endif
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s + 24s)); // 23:31:30 UTC on Friday, 13 February 2009
+
+ // Use supplied locale (ja_JP). This locale has a different alternate.
+#if defined(__APPLE__) || defined(__FreeBSD__)
+ check(loc,
+ SV("%c='木 1/ 1 00:00:00 1970'\t%Ec='木 1/ 1 00:00:00 1970'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+ check(loc,
+ SV("%c='金 2/13 23:31:30 2009'\t%Ec='金 2/13 23:31:30 2009'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009
+#elif defined(_AIX) // defined(__APPLE__)|| defined(__FreeBSD__)
+ check(loc,
+ SV("%c='1970年01月 1日 00:00:00 UTC'\t%Ec='1970年01月 1日 00:00:00 UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+ check(loc,
+ SV("%c='2009年02月13日 23:31:30 UTC'\t%Ec='2009年02月13日 23:31:30 UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009
+#elif defined(_WIN32) // defined(__APPLE__)|| defined(__FreeBSD__)
+ check(loc,
+ SV("%c='1970/01/01 0:00:00'\t%Ec='1970/01/01 0:00:00'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+ check(loc,
+ SV("%c='2009/02/13 23:31:30'\t%Ec='2009/02/13 23:31:30'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s)); // 23:31:30 UTC on Friday, 13 February 2009
+#else // defined(__APPLE__)|| defined(__FreeBSD__)
+ check(loc,
+ SV("%c='1970年01月01日 00時00分00秒'\t%Ec='昭和45年01月01日 00時00分00秒'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ check(loc,
+ SV("%c='2009年02月13日 23時31分30秒'\t%Ec='平成21年02月13日 23時31分30秒'\n"),
+ lfmt,
+ std::chrono::utc_seconds(1'234'567'890s + 24s)); // 23:31:30 UTC on Friday, 13 February 2009
+#endif // defined(__APPLE__)|| defined(__FreeBSD__)
+
+ std::locale::global(std::locale::classic());
+}
+
+template
+static void test_valid_values_time_zone() {
+ using namespace std::literals::chrono_literals;
+
+ constexpr std::basic_string_view fmt = SV("{:%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+ constexpr std::basic_string_view lfmt = SV("{:L%%z='%z'%t%%Ez='%Ez'%t%%Oz='%Oz'%t%%Z='%Z'%n}");
+
+ const std::locale loc(LOCALE_ja_JP_UTF_8);
+ std::locale::global(std::locale(LOCALE_fr_FR_UTF_8));
+
+ // Non localized output using C-locale
+ check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+ fmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ // Use the global locale (fr_FR)
+ check(SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ // Use supplied locale (ja_JP).
+ check(loc,
+ SV("%z='+0000'\t%Ez='+00:00'\t%Oz='+00:00'\t%Z='UTC'\n"),
+ lfmt,
+ std::chrono::utc_seconds(0s)); // 00:00:00 UTC Thursday, 1 January 1970
+
+ std::locale::global(std::locale::classic());
+}
+
+template
+static void test_utc_transitions() {
+ using namespace std::literals::chrono_literals;
+
+ constexpr std::basic_string_view fmt = SV("{:%F %T}");
+ check(SV("1972-06-30 23:59:59"), fmt, std::chrono::utc_seconds(78'796'799s));
+ check(SV("1972-06-30 23:59:60"), fmt, std::chrono::utc_seconds(78'796'800s));
+ check(SV("1972-07-01 00:00:00"), fmt, std::chrono::utc_seconds(78'796'801s));
+
+ check(SV("1972-12-31 23:59:59"), fmt, std::chrono::utc_seconds(94'694'400s));
+ check(SV("1972-12-31 23:59:60"), fmt, std::chrono::utc_seconds(94'694'401s));
+ check(SV("1973-01-01 00:00:00"), fmt, std::chrono::utc_seconds(94'694'402s));
+}
+
+template
+static void test_valid_values() {
+ test_valid_values_year();
+ test_valid_values_month();
+ test_valid_values_day();
+ test_valid_values_weekday();
+ test_valid_values_day_of_year();
+ test_valid_values_week();
+ test_valid_values_iso_8601_week();
+ test_valid_values_date();
+ test_valid_values_time();
+ test_valid_values_date_time();
+ test_valid_values_time_zone();
+
+ test_utc_transitions();
+}
+
+// To compute the correct UTC seconds value, the number of leap seconds needs
+// to be included in the UTC time. The number of leap seconds for times far in
+// the future is not yet known and may change.
+template +static void test() { + using namespace std::literals::chrono_literals; + + test_no_chrono_specs(); + test_valid_values(); + check_invalid_types( + {SV("a"), SV("A"), SV("b"), SV("B"), SV("c"), SV("C"), SV("d"), SV("D"), SV("e"), SV("F"), SV("g"), + SV("G"), SV("h"), SV("H"), SV("I"), SV("j"), SV("m"), SV("M"), SV("p"), SV("r"), SV("R"), SV("S"), + SV("T"), SV("u"), SV("U"), SV("V"), SV("w"), SV("W"), SV("x"), SV("X"), SV("y"), SV("Y"), SV("z"), + SV("Z"), SV("Ec"), SV("EC"), SV("Ex"), SV("EX"), SV("Ey"), SV("EY"), SV("Ez"), SV("Od"), SV("Oe"), SV("OH"), + SV("OI"), SV("Om"), SV("OM"), SV("OS"), SV("Ou"), SV("OU"), SV("OV"), SV("Ow"), SV("OW"), SV("Oy"), SV("Oz")}, + std::chrono::utc_seconds(0s)); + + check_exception("The format specifier expects a '%' or a '}'", SV("{:A"), std::chrono::utc_seconds(0s)); + check_exception("The chrono specifiers contain a '{'", SV("{:%%{"), std::chrono::utc_seconds(0s)); + check_exception("End of input while parsing a conversion specifier", SV("{:%"), std::chrono::utc_seconds(0s)); + check_exception("End of input while parsing the modifier E", SV("{:%E"), std::chrono::utc_seconds(0s)); + check_exception("End of input while parsing the modifier O", SV("{:%O"), std::chrono::utc_seconds(0s)); + + // Precision not allowed + check_exception("The format specifier expects a '%' or a '}'", SV("{:.3}"), std::chrono::utc_seconds(0s)); +} + +int main(int, char**) { + test(); + +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp index 52cfa2c81c21a..88a485a256c80 100644 --- a/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formattable/concept.formattable.compile.pass.cpp @@ -150,9 +150,15 @@ void test_P1361() { assert_is_formattable(); assert_is_formattable, CharT>(); - //assert_is_formattable, CharT>(); +# if !defined(TEST_HAS_NO_EXPERIMENTAL_TZDB) && !defined(TEST_HAS_NO_TIME_ZONE_DATABASE) && \ + !defined(TEST_HAS_NO_FILESYSTEM) + assert_is_formattable, CharT>(); //assert_is_formattable, CharT>(); //assert_is_formattable, CharT>(); + +# endif // !defined(TEST_HAS_NO_EXPERIMENTAL_TZDB) && !defined(TEST_HAS_NO_TIME_ZONE_DATABASE) && + // !defined(TEST_HAS_NO_FILESYSTEM) + assert_is_formattable, CharT>(); assert_is_formattable, CharT>(); From 12f82fbe072382bb78ab1cbdd3fbeb8ed44cbc81 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 24 Jan 2025 10:01:02 -0800 Subject: [PATCH 032/432] [compiler-rt] Fix Windows test after profile summary change (#124318) Fix a Windows compiler-rt test that https://github.com/llvm/llvm-project/pull/105915 broke. 
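For reference, the updated CHECK lines match an `llvm-profdata show` summary of
roughly this shape; the "blocks" and "count" values below are illustrative
placeholders, since the test only checks that the labels are present:

  Total functions: 3
  Maximum function count: 1
  Maximum internal block count: 0
  Total number of blocks: 9
  Total count: 3
  Binary IDs:
    <hex binary id>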
--- compiler-rt/test/profile/Windows/binary-id.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/compiler-rt/test/profile/Windows/binary-id.c b/compiler-rt/test/profile/Windows/binary-id.c index dadc623b7af38..f115de431618b 100644 --- a/compiler-rt/test/profile/Windows/binary-id.c +++ b/compiler-rt/test/profile/Windows/binary-id.c @@ -62,6 +62,8 @@ int main() { // BINARY-ID-RAW-PROF-NEXT: Total functions: 3 // BINARY-ID-RAW-PROF-NEXT: Maximum function count: 1 // BINARY-ID-RAW-PROF-NEXT: Maximum internal block count: 0 +// BINARY-ID-RAW-PROF-NEXT: Total number of blocks: +// BINARY-ID-RAW-PROF-NEXT: Total count: // BINARY-ID-RAW-PROF-NEXT: Binary IDs: // BINARY-ID-RAW-PROF-NEXT: {{[0-9a-f]+}} @@ -69,6 +71,8 @@ int main() { // ONE-BINARY-ID-NEXT: Total functions: 3 // ONE-BINARY-ID-NEXT: Maximum function count: 3 // ONE-BINARY-ID-NEXT: Maximum internal block count: 0 +// ONE-BINARY-ID-NEXT: Total number of blocks: +// ONE-BINARY-ID-NEXT: Total count: // ONE-BINARY-ID-NEXT: Binary IDs: // ONE-BINARY-ID-NEXT: {{[0-9a-f]+}} @@ -76,6 +80,8 @@ int main() { // MULTI-BINARY-ID-NEXT: Total functions: 3 // MULTI-BINARY-ID-NEXT: Maximum function count: 1 // MULTI-BINARY-ID-NEXT: Maximum internal block count: 0 +// MULTI-BINARY-ID-NEXT: Total number of blocks: +// MULTI-BINARY-ID-NEXT: Total count: // MULTI-BINARY-ID-NEXT: Binary IDs: // MULTI-BINARY-ID-NEXT: {{[0-9a-f]+}} // MULTI-BINARY-ID-NEXT: {{[0-9a-f]+}} From 7293455cf292cfaa263ea04fc1bc2aee4ceab6a6 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Fri, 24 Jan 2025 10:02:15 -0800 Subject: [PATCH 033/432] [lldb] Add SBThread.selected_frame property (#123981) Adds a `selected_frame` property to `SBThread`. The setter accepts either a frame index (like `SetSelectedFrame`), or a frame object. Updates a few tests to make use of the new `selected_frame`. While doing so I noticed some of the usage could be cleaned up, so I did that too. --- lldb/bindings/interface/SBThreadExtensions.i | 9 +++++++++ .../commands/frame/recognizer/TestFrameRecognizer.py | 8 +++----- .../location-list-lookup/TestLocationListLookup.py | 4 ++-- .../TestStdFunctionRecognizer.py | 11 ++++------- lldb/test/API/lang/objc/print-obj/TestPrintObj.py | 9 +++------ 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/lldb/bindings/interface/SBThreadExtensions.i b/lldb/bindings/interface/SBThreadExtensions.i index 860a2d765a669..267faad9d651f 100644 --- a/lldb/bindings/interface/SBThreadExtensions.i +++ b/lldb/bindings/interface/SBThreadExtensions.i @@ -51,6 +51,14 @@ STRING_EXTENSION_OUTSIDE(SBThread) for idx in range(self.GetStopReasonDataCount()) ] + def set_selected_frame(self, frame): + if isinstance(frame, SBFrame): + if frame.thread != self: + raise ValueError("cannot select frame from different thread") + self.SetSelectedFrame(frame.idx) + else: + self.SetSelectedFrame(frame) + id = property(GetThreadID, None, doc='''A read only property that returns the thread ID as an integer.''') idx = property(GetIndexID, None, doc='''A read only property that returns the thread index ID as an integer. 
Thread index ID values start at 1 and increment as threads come and go and can be used to uniquely identify threads.''') return_value = property(GetStopReturnValue, None, doc='''A read only property that returns an lldb object that represents the return value from the last stop (lldb.SBValue) if we just stopped due to stepping out of a function.''') @@ -65,6 +73,7 @@ STRING_EXTENSION_OUTSIDE(SBThread) stop_reason_data = property(get_stop_reason_data, None, doc='''A read only property that returns the stop reason data as a list.''') is_suspended = property(IsSuspended, None, doc='''A read only property that returns a boolean value that indicates if this thread is suspended.''') is_stopped = property(IsStopped, None, doc='''A read only property that returns a boolean value that indicates if this thread is stopped but not exited.''') + selected_frame = property(GetSelectedFrame, set_selected_frame, doc='''A read/write property that gets and sets the selected frame of this SBThread.''') %} #endif } diff --git a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py index aa2a448087431..3e9dbfe6d8555 100644 --- a/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py +++ b/lldb/test/API/commands/frame/recognizer/TestFrameRecognizer.py @@ -20,7 +20,7 @@ def test_frame_recognizer_1(self): target, process, thread, _ = lldbutil.run_to_name_breakpoint( self, "foo", exe_name=exe ) - frame = thread.GetSelectedFrame() + frame = thread.selected_frame # Clear internal & plugins recognizers that get initialized at launch self.runCmd("frame recognizer clear") @@ -166,7 +166,7 @@ def test_frame_recognizer_hiding(self): self.build() target, process, thread, _ = lldbutil.run_to_name_breakpoint(self, "nested") - frame = thread.GetSelectedFrame() + frame = thread.selected_frame # Sanity check. 
self.expect( @@ -229,7 +229,6 @@ def test_frame_recognizer_multi_symbol(self): target, process, thread, _ = lldbutil.run_to_name_breakpoint( self, "foo", exe_name=exe ) - frame = thread.GetSelectedFrame() self.expect( "frame recognizer info 0", @@ -239,7 +238,6 @@ def test_frame_recognizer_multi_symbol(self): target, process, thread, _ = lldbutil.run_to_name_breakpoint( self, "bar", exe_name=exe ) - frame = thread.GetSelectedFrame() self.expect( "frame recognizer info 0", @@ -374,7 +372,7 @@ def test_frame_recognizer_not_only_first_instruction(self): opts = lldb.SBVariablesOptions() opts.SetIncludeRecognizedArguments(True) - frame = thread.GetSelectedFrame() + frame = thread.selected_frame variables = frame.GetVariables(opts) self.assertEqual(variables.GetSize(), 2) diff --git a/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py b/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py index 84033daff7730..a97f4fc5e3d79 100644 --- a/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py +++ b/lldb/test/API/functionalities/location-list-lookup/TestLocationListLookup.py @@ -25,7 +25,7 @@ def check_local_vars(self, process: lldb.SBProcess, check_expr: bool): # Find `bar` on the stack, then # make sure we can read out the local # variables (with both `frame var` and `expr`) - for f in process.GetSelectedThread().frames: + for f in process.selected_thread.frames: frame_name = f.GetDisplayFunctionName() if frame_name is not None and frame_name.startswith("Foo::bar"): argv = f.GetValueForVariablePath("argv").GetChildAtIndex(0) @@ -34,7 +34,7 @@ def check_local_vars(self, process: lldb.SBProcess, check_expr: bool): self.assertNotEqual(strm.GetData().find("a.out"), -1) if check_expr: - process.GetSelectedThread().SetSelectedFrame(f.idx) + process.selected_thread.selected_frame = f self.expect_expr("this", result_type="Foo *") @skipIf(oslist=["linux"], archs=["arm"]) diff --git a/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py b/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py index 978bf2066e43b..f5d0ea41e3114 100644 --- a/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py +++ b/lldb/test/API/lang/cpp/std-function-recognizer/TestStdFunctionRecognizer.py @@ -69,14 +69,14 @@ def test_up_down(self): (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( self, "// break here", lldb.SBFileSpec("main.cpp") ) - frame = thread.GetSelectedFrame() + frame = thread.selected_frame # up self.assertIn("foo", frame.GetFunctionName()) start_idx = frame.GetFrameID() i = 0 while i < thread.GetNumFrames(): self.expect("up") - frame = thread.GetSelectedFrame() + frame = thread.selected_frame if frame.GetFunctionName() == "main": break end_idx = frame.GetFrameID() @@ -86,7 +86,7 @@ def test_up_down(self): start_idx = frame.GetFrameID() for i in range(1, thread.GetNumFrames()): self.expect("down") - frame = thread.GetSelectedFrame() + frame = thread.selected_frame if "foo" in frame.GetFunctionName(): break end_idx = frame.GetFrameID() @@ -99,11 +99,8 @@ def test_api(self): (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( self, "// break here", lldb.SBFileSpec("main.cpp") ) - frame = thread.GetSelectedFrame() num_hidden = 0 - for i in range(1, thread.GetNumFrames()): - thread.SetSelectedFrame(i) - frame = thread.GetSelectedFrame() + for frame in thread.frames: if frame.IsHidden(): num_hidden += 1 diff --git 
a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py index 60fc4fbc51cee..3ad4a09b53206 100644 --- a/lldb/test/API/lang/objc/print-obj/TestPrintObj.py +++ b/lldb/test/API/lang/objc/print-obj/TestPrintObj.py @@ -69,12 +69,9 @@ def test_print_obj(self): # We want to traverse the frame to the one corresponding to blocked.m to # issue our 'po lock_me' command. - depth = other_thread.GetNumFrames() - for i in range(depth): - frame = other_thread.GetFrameAtIndex(i) - name = frame.GetFunctionName() - if name == "main": - other_thread.SetSelectedFrame(i) + for frame in other_thread.frames: + if frame.name == "main": + other_thread.selected_frame = frame if self.TraceOn(): print("selected frame:" + lldbutil.get_description(frame)) break From a9ad601f7c5486919d6fabc5dd3cb6e96f63ac61 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 24 Jan 2025 10:08:42 -0800 Subject: [PATCH 034/432] [RISCV] Use vrsub for select of add and sub of the same operands (#123400) If we have a (vselect c, a+b, a-b), we can combine this to a+(vselect c, b, -b). That by itself isn't hugely profitable, but if we reverse the select, we get a form which matches a masked vrsub.vi with zero. The result is that we can use a masked vrsub *before* the add instead of a masked add or sub. This doesn't change the critical path (since we already had the pass through on the masked second op), but does reduce register pressure since a, b, and (a+b) don't need to all be alive at once. In addition to the vselect form, we can also see the same pattern with a vector_shuffle encoding the vselect. I explored canonicalizing these to vselects instead, but that exposes several unrelated missing combines. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 89 +++++++++- .../RISCV/rvv/fixed-vectors-select-addsub.ll | 162 ++++++------------ 2 files changed, 139 insertions(+), 112 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 618fb28d3e9f9..5e5bc0819a10c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1535,7 +1535,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::UDIV, ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT, ISD::ABS, ISD::CTPOP, - ISD::VECTOR_SHUFFLE}); + ISD::VECTOR_SHUFFLE, ISD::VSELECT}); + if (Subtarget.hasVendorXTHeadMemPair()) setTargetDAGCombine({ISD::LOAD, ISD::STORE}); if (Subtarget.useRVVForFixedLengthVectors()) @@ -16874,6 +16875,53 @@ static SDValue useInversedSetcc(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static bool matchSelectAddSub(SDValue TrueVal, SDValue FalseVal, bool &SwapCC) { + if (!TrueVal.hasOneUse() || !FalseVal.hasOneUse()) + return false; + + SwapCC = false; + if (TrueVal.getOpcode() == ISD::SUB && FalseVal.getOpcode() == ISD::ADD) { + std::swap(TrueVal, FalseVal); + SwapCC = true; + } + + if (TrueVal.getOpcode() != ISD::ADD || FalseVal.getOpcode() != ISD::SUB) + return false; + + SDValue A = FalseVal.getOperand(0); + SDValue B = FalseVal.getOperand(1); + // Add is commutative, so check both orders + return ((TrueVal.getOperand(0) == A && TrueVal.getOperand(1) == B) || + (TrueVal.getOperand(1) == A && TrueVal.getOperand(0) == B)); +} + +/// Convert vselect CC, (add a, b), (sub a, b) to add a, (vselect CC, -b, b). +/// This allows us match a vadd.vv fed by a masked vrsub, which reduces +/// register pressure over the add followed by masked vsub sequence. 
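In IR terms, the combine the comment above describes looks roughly like this (an illustrative sketch, not taken from the patch's tests):

```llvm
define <4 x i32> @before(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) {
  ; %a, %b, and the add result are all live across the select.
  %add = add <4 x i32> %a, %b
  %sub = sub <4 x i32> %a, %b
  %res = select <4 x i1> %cc, <4 x i32> %add, <4 x i32> %sub
  ret <4 x i32> %res
}

define <4 x i32> @after(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) {
  ; Conditionally negate %b under the inverted mask; this is what matches
  ; a masked vrsub.vi with immediate 0, leaving a single unmasked vadd.
  %negb = sub <4 x i32> zeroinitializer, %b
  %ncc  = xor <4 x i1> %cc, <i1 true, i1 true, i1 true, i1 true>
  %selb = select <4 x i1> %ncc, <4 x i32> %negb, <4 x i32> %b
  %res  = add <4 x i32> %a, %selb
  ret <4 x i32> %res
}
```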
+static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue CC = N->getOperand(0); + SDValue TrueVal = N->getOperand(1); + SDValue FalseVal = N->getOperand(2); + + bool SwapCC; + if (!matchSelectAddSub(TrueVal, FalseVal, SwapCC)) + return SDValue(); + + SDValue Sub = SwapCC ? TrueVal : FalseVal; + SDValue A = Sub.getOperand(0); + SDValue B = Sub.getOperand(1); + + // Arrange the select such that we can match a masked + // vrsub.vi to perform the conditional negate + SDValue NegB = DAG.getNegative(B, DL, VT); + if (!SwapCC) + CC = DAG.getLogicalNOT(DL, CC, CC->getValueType(0)); + SDValue NewB = DAG.getNode(ISD::VSELECT, DL, VT, CC, NegB, B); + return DAG.getNode(ISD::ADD, DL, VT, A, NewB); +} + static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { if (SDValue Folded = foldSelectOfCTTZOrCTLZ(N, DAG)) @@ -17153,20 +17201,48 @@ static SDValue performCONCAT_VECTORSCombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT.getSimpleVT(), StridedLoad); } -/// Custom legalize or to . This runs -/// during the combine phase before type legalization, and relies on -/// DAGCombine not undoing the transform if isShuffleMaskLegal returns false -/// for the source mask. static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const RISCVTargetLowering &TLI) { SDLoc DL(N); EVT VT = N->getValueType(0); const unsigned ElementSize = VT.getScalarSizeInBits(); + const unsigned NumElts = VT.getVectorNumElements(); SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); ArrayRef Mask = cast(N)->getMask(); + MVT XLenVT = Subtarget.getXLenVT(); + + // Recognized a disguised select of add/sub. + bool SwapCC; + if (ShuffleVectorInst::isSelectMask(Mask, NumElts) && + matchSelectAddSub(V1, V2, SwapCC)) { + SDValue Sub = SwapCC ? V1 : V2; + SDValue A = Sub.getOperand(0); + SDValue B = Sub.getOperand(1); + + SmallVector MaskVals; + for (int MaskIndex : Mask) { + bool SelectMaskVal = (MaskIndex < (int)NumElts); + MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT)); + } + assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle"); + EVT MaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); + SDValue CC = DAG.getBuildVector(MaskVT, DL, MaskVals); + // Arrange the select such that we can match a masked + // vrsub.vi to perform the conditional negate + SDValue NegB = DAG.getNegative(B, DL, VT); + if (!SwapCC) + CC = DAG.getLogicalNOT(DL, CC, CC->getValueType(0)); + SDValue NewB = DAG.getNode(ISD::VSELECT, DL, VT, CC, NegB, B); + return DAG.getNode(ISD::ADD, DL, VT, A, NewB); + } + + // Custom legalize or to . This runs + // during the combine phase before type legalization, and relies on + // DAGCombine not undoing the transform if isShuffleMaskLegal returns false + // for the source mask. 
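The shuffle path above handles the same pattern in disguise: a shufflevector whose mask only ever picks lane i from one of its two inputs behaves as a vselect. A sketch of an input that now matches (mask values chosen for illustration; any select-style mask qualifies):

```llvm
define <4 x i32> @disguised_select(<4 x i32> %a, <4 x i32> %b) {
  %add = add <4 x i32> %a, %b
  %sub = sub <4 x i32> %a, %b
  ; Mask <0,5,2,7>: lanes 0 and 2 take %add, lanes 1 and 3 take %sub,
  ; so this shuffle is a vselect with condition <1,0,1,0> in disguise.
  %res = shufflevector <4 x i32> %add, <4 x i32> %sub, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %res
}
```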
if (TLI.isTypeLegal(VT) || ElementSize <= Subtarget.getELen() || !isPowerOf2_64(ElementSize) || VT.getVectorNumElements() % 2 != 0 || VT.isFloatingPoint() || TLI.isShuffleMaskLegal(Mask, VT)) @@ -17183,7 +17259,6 @@ static SDValue performVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, Res); } - static SDValue combineToVWMACC(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { @@ -17857,6 +17932,8 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return performTRUNCATECombine(N, DAG, Subtarget); case ISD::SELECT: return performSELECTCombine(N, DAG, Subtarget); + case ISD::VSELECT: + return performVSELECTCombine(N, DAG); case RISCVISD::CZERO_EQZ: case RISCVISD::CZERO_NEZ: { SDValue Val = N->getOperand(0); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll index ee9609992c049..318f38839851c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-select-addsub.ll @@ -9,9 +9,8 @@ define <1 x i32> @select_addsub_v1i32(<1 x i1> %cc, <1 x i32> %a, <1 x i32> %b) ; CHECK-LABEL: select_addsub_v1i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <1 x i32> %a, %b %add = add <1 x i32> %a, %b @@ -23,9 +22,8 @@ define <2 x i32> @select_addsub_v2i32(<2 x i1> %cc, <2 x i32> %a, <2 x i32> %b) ; CHECK-LABEL: select_addsub_v2i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <2 x i32> %a, %b %add = add <2 x i32> %a, %b @@ -37,9 +35,8 @@ define <4 x i32> @select_addsub_v4i32(<4 x i1> %cc, <4 x i32> %a, <4 x i32> %b) ; CHECK-LABEL: select_addsub_v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -51,9 +48,9 @@ define <4 x i32> @select_addsub_v4i32_select_swapped(<4 x i1> %cc, <4 x i32> %a, ; CHECK-LABEL: select_addsub_v4i32_select_swapped: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vsub.vv v10, v8, v9 -; CHECK-NEXT: vadd.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmnot.m v0, v0 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -65,9 +62,8 @@ define <4 x i32> @select_addsub_v4i32_add_swapped(<4 x i1> %cc, <4 x i32> %a, <4 ; CHECK-LABEL: select_addsub_v4i32_add_swapped: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v9, v8 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %b, %a @@ -79,9 +75,9 @@ define <4 x i32> @select_addsub_v4i32_both_swapped(<4 x i1> %cc, <4 x i32> %a, < ; CHECK-LABEL: select_addsub_v4i32_both_swapped: 
; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vsub.vv v10, v8, v9 -; CHECK-NEXT: vadd.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmnot.m v0, v0 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %b, %a @@ -93,12 +89,11 @@ define <4 x i32> @select_addsub_v4i32_sub_swapped(<4 x i1> %cc, <4 x i32> %a, <4 ; CHECK-LABEL: select_addsub_v4i32_sub_swapped: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v9, v8 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret - %sub = sub <4 x i32> %a, %b - %add = add <4 x i32> %b, %a + %sub = sub <4 x i32> %b, %a + %add = add <4 x i32> %a, %b %res = select <4 x i1> %cc, <4 x i32> %sub, <4 x i32> %add ret <4 x i32> %res } @@ -107,9 +102,8 @@ define <8 x i32> @select_addsub_v8i32(<8 x i1> %cc, <8 x i32> %a, <8 x i32> %b) ; CHECK-LABEL: select_addsub_v8i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, mu -; CHECK-NEXT: vadd.vv v12, v8, v10 -; CHECK-NEXT: vsub.vv v12, v8, v10, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrsub.vi v10, v10, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v10 ; CHECK-NEXT: ret %sub = sub <8 x i32> %a, %b %add = add <8 x i32> %a, %b @@ -121,9 +115,8 @@ define <16 x i32> @select_addsub_v16i32(<16 x i1> %cc, <16 x i32> %a, <16 x i32> ; CHECK-LABEL: select_addsub_v16i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; CHECK-NEXT: vadd.vv v16, v8, v12 -; CHECK-NEXT: vsub.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %sub = sub <16 x i32> %a, %b %add = add <16 x i32> %a, %b @@ -136,9 +129,8 @@ define <32 x i32> @select_addsub_v32i32(<32 x i1> %cc, <32 x i32> %a, <32 x i32> ; CHECK: # %bb.0: ; CHECK-NEXT: li a0, 32 ; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, mu -; CHECK-NEXT: vadd.vv v24, v8, v16 -; CHECK-NEXT: vsub.vv v24, v8, v16, v0.t -; CHECK-NEXT: vmv.v.v v8, v24 +; CHECK-NEXT: vrsub.vi v16, v16, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: ret %sub = sub <32 x i32> %a, %b %add = add <32 x i32> %a, %b @@ -153,62 +145,28 @@ define <64 x i32> @select_addsub_v64i32(<64 x i1> %cc, <64 x i32> %a, <64 x i32> ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: csrr a1, vlenb ; CHECK-NEXT: slli a1, a1, 3 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: slli a1, a1, 1 -; CHECK-NEXT: add a1, a1, a2 ; CHECK-NEXT: sub sp, sp, a1 -; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb -; CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: slli a1, a1, 4 -; CHECK-NEXT: add a1, sp, a1 -; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: addi a1, sp, 16 ; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv8r.v v16, v8 ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vle32.v v16, (a0) +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; 
CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill -; CHECK-NEXT: vadd.vv v24, v8, v16 -; CHECK-NEXT: vsub.vv v24, v8, v16, v0.t -; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t ; CHECK-NEXT: vsetivli zero, 4, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vi v0, v0, 4 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; CHECK-NEXT: vadd.vv v16, v16, v8 -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: slli a0, a0, 4 -; CHECK-NEXT: add a0, sp, a0 -; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload -; CHECK-NEXT: vsub.vv v16, v24, v8, v0.t +; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vrsub.vi v24, v24, 0, v0.t ; CHECK-NEXT: addi a0, sp, 16 -; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vadd.vv v16, v16, v24 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: slli a0, a0, 1 -; CHECK-NEXT: add a0, a0, a1 ; CHECK-NEXT: add sp, sp, a0 ; CHECK-NEXT: .cfi_def_cfa sp, 16 ; CHECK-NEXT: addi sp, sp, 16 @@ -224,9 +182,8 @@ define <8 x i64> @select_addsub_v8i64(<8 x i1> %cc, <8 x i64> %a, <8 x i64> %b) ; CHECK-LABEL: select_addsub_v8i64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, mu -; CHECK-NEXT: vadd.vv v16, v8, v12 -; CHECK-NEXT: vsub.vv v16, v8, v12, v0.t -; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: vrsub.vi v12, v12, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v12 ; CHECK-NEXT: ret %sub = sub <8 x i64> %a, %b %add = add <8 x i64> %a, %b @@ -238,9 +195,8 @@ define <8 x i16> @select_addsub_v8i16(<8 x i1> %cc, <8 x i16> %a, <8 x i16> %b) ; CHECK-LABEL: select_addsub_v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i16> %a, %b %add = add <8 x i16> %a, %b @@ -252,9 +208,8 @@ define <8 x i8> @select_addsub_v8i8(<8 x i1> %cc, <8 x i8> %a, <8 x i8> %b) { ; CHECK-LABEL: select_addsub_v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i8> %a, %b %add = add <8 x i8> %a, %b @@ -278,9 +233,8 @@ define <8 x i2> @select_addsub_v8i2(<8 x i1> %cc, <8 x i2> %a, <8 x i2> %b) { ; CHECK-LABEL: select_addsub_v8i2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <8 x i2> %a, %b %add = add <8 x i2> %a, %b @@ -293,9 +247,8 @@ define 
<4 x i32> @select_addsub_v4i32_constmask(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -307,14 +260,13 @@ define <4 x i32> @select_addsub_v4i32_constmask2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_constmask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v9, v8 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret - %sub = sub <4 x i32> %a, %b - %add = add <4 x i32> %b, %a - %res = select <4 x i1> , <4 x i32> %sub, <4 x i32> %add + %sub = sub <4 x i32> %b, %a + %add = add <4 x i32> %a, %b + %res = select <4 x i1> , <4 x i32> %add, <4 x i32> %sub ret <4 x i32> %res } @@ -324,9 +276,8 @@ define <4 x i32> @select_addsub_v4i32_as_shuffle(<4 x i32> %a, <4 x i32> %b) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu ; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrsub.vi v9, v9, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v9 ; CHECK-NEXT: ret %sub = sub <4 x i32> %a, %b %add = add <4 x i32> %a, %b @@ -339,13 +290,12 @@ define <4 x i32> @select_addsub_v4i32_as_shuffle2(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: select_addsub_v4i32_as_shuffle2: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vmv.v.i v0, 5 -; CHECK-NEXT: vadd.vv v10, v8, v9 -; CHECK-NEXT: vsub.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.i v0, 10 +; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t +; CHECK-NEXT: vadd.vv v8, v9, v8 ; CHECK-NEXT: ret %sub = sub <4 x i32> %b, %a %add = add <4 x i32> %a, %b - %res = shufflevector <4 x i32> %sub, <4 x i32> %add, <4 x i32> + %res = shufflevector <4 x i32> %add, <4 x i32> %sub, <4 x i32> ret <4 x i32> %res } From 544a3cb65b6b9b1455f9294d1764f47a7b8673b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 10:09:38 -0800 Subject: [PATCH 035/432] [flang][cuda] Handle variable with initialization in device global pass (#124307) --- .../Optimizer/Transforms/CUFDeviceGlobal.cpp | 12 +++++------ .../Fir/CUDA/cuda-implicit-device-global.f90 | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp index 5ce39f99bbb12..7486dde0e281e 100644 --- a/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp +++ b/flang/lib/Optimizer/Transforms/CUFDeviceGlobal.cpp @@ -36,13 +36,11 @@ static void processAddrOfOp(fir::AddrOfOp addrOfOp, addrOfOp.getSymbol().getRootReference().getValue())) { // TO DO: limit candidates to non-scalars. Scalars appear to have been // folded in already. 
- if (globalOp.getConstant()) { - if (recurseInGlobal) - globalOp.walk([&](fir::AddrOfOp op) { - processAddrOfOp(op, symbolTable, candidates, recurseInGlobal); - }); - candidates.insert(globalOp); - } + if (recurseInGlobal) + globalOp.walk([&](fir::AddrOfOp op) { + processAddrOfOp(op, symbolTable, candidates, recurseInGlobal); + }); + candidates.insert(globalOp); } } diff --git a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 index 9b22ed86e419c..11866d871a607 100644 --- a/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 +++ b/flang/test/Fir/CUDA/cuda-implicit-device-global.f90 @@ -308,3 +308,24 @@ // Test that global used in device function are flagged with the correct // CHECK-DAG: fir.global linkonce_odr @_QM__mod1E.c.__builtin_c_devptr // CHECK-DAG: fir.global linkonce_odr @_QM__mod1E.dt.__builtin_c_devptr // CHECK-DAG: fir.global linkonce_odr @_QM__mod1E.n.__builtin_c_devptr + +// ----- + +// Variables with initialization are promoted to non constant global. +// +// attributes(global) subroutine kernel4() +// integer :: a = 4 +// end subroutine + +func.func @_QPkernel4() attributes {cuf.proc_attr = #cuf.cuda_proc} { + %0 = fir.address_of(@_QFkernel4Ea) : !fir.ref + return +} +fir.global internal @_QFkernel4Ea : i32 { + %c4_i32 = arith.constant 4 : i32 + fir.has_value %c4_i32 : i32 +} + +// CHECK-LABEL: fir.global internal @_QFkernel4Ea : i32 +// CHECK-LABEL: gpu.module @cuda_device_mod +// CHECK: fir.global internal @_QFkernel4Ea : i32 From d9b8120259a546ce7aa9f047566fef29479f59e8 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 24 Jan 2025 13:14:21 -0500 Subject: [PATCH 036/432] [lld/COFF] Fix -start-lib / -end-lib more after reviews.llvm.org/D116434 (#124294) This is a follow-up to #120452 in a way. Since lld/COFF does not yet insert all defined in an obj file before all undefineds (ELF and MachO do this, see #67445 and things linked from there), it's possible that: 1. We add an obj file a.obj 2. a.obj contains an undefined that's in b.obj, causing b.obj to be added 3. b.obj contains an undefined that's in a part of a.obj that's not yet in the symbol table, causing a recursive load of a.obj, which adds the symbols in there twice, leading to duplicate symbol errors. For normal archives, `ArchiveFile::addMember()` has a `seen` check to prevent this. For start-lib lazy objects, we can just check if the archive is still lazy at the recursive call. This bug is similar to issue #59162. (Eventually, we'll probably want to do what the MachO and ELF ports do.) Includes a test that caused duplicate symbol diagnostics before this code change. --- lld/COFF/InputFiles.cpp | 2 ++ lld/COFF/SymbolTable.cpp | 4 +++ lld/test/COFF/start-lib.ll | 69 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index fe1135db636cb..47faf70e099e1 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -151,6 +151,8 @@ void ArchiveFile::addMember(const Archive::Symbol &sym) { toCOFFString(symtab.ctx, sym)); // Return an empty buffer if we have already returned the same buffer. + // FIXME: Remove this once we resolve all defineds before all undefineds in + // ObjFile::initializeSymbols(). 
if (!seen.insert(c.getChildOffset()).second) return; diff --git a/lld/COFF/SymbolTable.cpp b/lld/COFF/SymbolTable.cpp index 32ea4a5b2e1fc..307bd4a0c9411 100644 --- a/lld/COFF/SymbolTable.cpp +++ b/lld/COFF/SymbolTable.cpp @@ -56,6 +56,10 @@ static void forceLazy(Symbol *s) { } case Symbol::Kind::LazyObjectKind: { InputFile *file = cast(s)->file; + // FIXME: Remove this once we resolve all defineds before all undefineds in + // ObjFile::initializeSymbols(). + if (!file->lazy) + return; file->lazy = false; file->symtab.ctx.driver.addFile(file); break; diff --git a/lld/test/COFF/start-lib.ll b/lld/test/COFF/start-lib.ll index a46147f21ccbb..134cdc2a6e1df 100644 --- a/lld/test/COFF/start-lib.ll +++ b/lld/test/COFF/start-lib.ll @@ -173,3 +173,72 @@ target triple = "x86_64-pc-windows-msvc" define void @baz() { ret void } + + +; Check cycles between symbols in two /start-lib files. +; If the links succeed and does not emit duplicate symbol diagnostics, +; that's enough. + +; RUN: llc -filetype=obj %t.dir/main3.ll -o %t-main3.obj +; RUN: llc -filetype=obj %t.dir/cycle1.ll -o %t-cycle1.obj +; RUN: llc -filetype=obj %t.dir/cycle2.ll -o %t-cycle2.obj +; RUN: opt -thinlto-bc %t.dir/main3.ll -o %t-main3.bc +; RUN: opt -thinlto-bc %t.dir/cycle1.ll -o %t-cycle1.bc +; RUN: opt -thinlto-bc %t.dir/cycle2.ll -o %t-cycle2.bc + +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.obj %t-cycle1.obj %t-cycle2.obj +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.obj /start-lib %t-cycle1.obj %t-cycle2.obj /end-lib +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: /start-lib %t-cycle1.obj %t-cycle2.obj /end-lib %t-main3.obj + +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.bc %t-cycle1.bc %t-cycle2.bc +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: %t-main3.bc /start-lib %t-cycle1.bc %t-cycle2.bc /end-lib +; RUN: lld-link -out:%t3.exe -entry:main \ +; RUN: /start-lib %t-cycle1.bc %t-cycle2.bc /end-lib %t-main3.bc + +#--- main3.ll + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +declare void @foo1() + +define void @main() { + call void () @foo1() + ret void +} + +#--- cycle1.ll + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +declare void @bar() + +define void @foo1() { + ; cycle1.ll pulls in cycle2.ll for bar(), and cycle2.ll then pulls in + ; cycle1.ll again for foo2(). + call void () @bar() + ret void +} + +define void @foo2() { + ret void +} + + +#--- cycle2.ll + +target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-windows-msvc" + +declare void @foo2() + +define void @bar() { + call void () @foo2() + ret void +} From e4009ed3d68ba8d9e78721ce5afc2b3a7edd6f36 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 24 Jan 2025 19:17:51 +0100 Subject: [PATCH 037/432] [clang][docs] Update bytecode interpreter docs (#124252) Just a light update, not adding a lot of new information. --- clang/docs/ConstantInterpreter.rst | 119 ++++------------------------- clang/www/OpenProjects.html | 11 --- 2 files changed, 16 insertions(+), 114 deletions(-) diff --git a/clang/docs/ConstantInterpreter.rst b/clang/docs/ConstantInterpreter.rst index 0c5b09c73ee30..b08cb1ce353be 100644 --- a/clang/docs/ConstantInterpreter.rst +++ b/clang/docs/ConstantInterpreter.rst @@ -18,8 +18,8 @@ by the evaluator. 
The interpreter is activated using the following flags: Bytecode Compilation ==================== -Bytecode compilation is handled in ``ByteCodeStmtGen.h`` for statements -and ``ByteCodeExprGen.h`` for expressions. The compiler has two different +Bytecode compilation is handled in ``Compiler.h`` for statements +and for expressions. The compiler has two different backends: one to generate bytecode for functions (``ByteCodeEmitter``) and one to directly evaluate expressions as they are compiled, without generating bytecode (``EvalEmitter``). All functions are compiled to @@ -44,11 +44,11 @@ Primitive Types Signed or unsigned integers of a specific bit width, implemented using the ```Integral``` type. -* ``PT_{U|S}intFP`` +* ``PT_IntAP{S}`` Signed or unsigned integers of an arbitrary, but fixed width used to implement integral types which are required by the target, but are not - supported by the host. Under the hood, they rely on APValue. The + supported by the host. Under the hood, they rely on ``APInt``. The ``Integral`` specialisation for these types is required by opcodes to share an implementation with fixed integrals. @@ -57,7 +57,7 @@ Primitive Types Representation for boolean types, essentially a 1-bit unsigned ``Integral``. -* ``PT_RealFP`` +* ``PT_Float`` Arbitrary, but fixed precision floating point numbers. Could be specialised in the future similarly to integers in order to improve @@ -65,30 +65,21 @@ Primitive Types * ``PT_Ptr`` - Pointer type, defined in ``"Pointer.h"``. A pointer can be either null, - reference interpreter-allocated memory (``BlockPointer``) or point to an - address which can be derived, but not accessed (``ExternPointer``). + Pointer type, defined in ``"Pointer.h"``. The most common type of + pointer is a "BlockPointer", which points to an ``interp::Block``. + But other pointer types exist, such as typeid pointers or + integral pointers. * ``PT_FnPtr`` Function pointer type, can also be a null function pointer. Defined - in ``"FnPointer.h"``. + in ``"FunctionPointer.h"``. -* ``PT_MemPtr`` +* ``PT_MemberPtr`` Member pointer type, can also be a null member pointer. Defined in ``"MemberPointer.h"`` -* ``PT_VoidPtr`` - - Void pointer type, can be used for round-trip casts. Represented as - the union of all pointers which can be cast to void. - Defined in ``"VoidPointer.h"``. - -* ``PT_ObjCBlockPtr`` - - Pointer type for ObjC blocks. Defined in ``"ObjCBlockPointer.h"``. - Composite types --------------- @@ -219,35 +210,21 @@ Pointers -------- Pointers, implemented in ``Pointer.h`` are represented as a tagged union. -Some of these may not yet be available in upstream ``clang``. * **BlockPointer**: used to reference memory allocated and managed by the interpreter, being the only pointer kind which allows dereferencing in the interpreter - * **ExternPointer**: points to memory which can be addressed, but not read by - the interpreter. It is equivalent to APValue, tracking a declaration and a path - of fields and indices into that allocation. - * **TargetPointer**: represents a target address derived from a base address - through pointer arithmetic, such as ``((int *)0x100)[20]``. Null pointers are - target pointers with a zero offset. - * **TypeInfoPointer**: tracks information for the opaque type returned by + * **TypeIDPointer**: tracks information for the opaque type returned by ``typeid`` - * **InvalidPointer**: is dummy pointer created by an invalid operation which - allows the interpreter to continue execution. 
Does not allow pointer - arithmetic or dereferencing. + * **IntegralPointer**: a pointer formed from an integer, + think ``(int*)123``. Besides the previously mentioned union, a number of other pointer-like types have their own type: - * **ObjCBlockPointer** tracks Objective-C blocks - * **FnPointer** tracks functions and lazily caches their compiled version + * **FunctionPointer** tracks functions. * **MemberPointer** tracks C++ object members -Void pointers, which can be built by casting any of the aforementioned -pointers, are implemented as a union of all pointer types. The ``BitCast`` -opcode is responsible for performing all legal conversions between these -types and primitive integers. - BlockPointer ~~~~~~~~~~~~ @@ -311,73 +288,9 @@ of ``a.c``, but its offset would point to ``&a.c[1]``. The array-to-pointer decay operation adjusts a pointer to an array (where the offset is equal to the base) to a pointer to the first element. -ExternPointer -~~~~~~~~~~~~~ - -Extern pointers can be derived, pointing into symbols which are not -readable from constexpr. An external pointer consists of a base -declaration, along with a path designating a subobject, similar to -the ``LValuePath`` of an APValue. Extern pointers can be converted -to block pointers if the underlying variable is defined after the -pointer is created, as is the case in the following example: - -.. code-block:: c - - extern const int a; - constexpr const int *p = &a; - const int a = 5; - static_assert(*p == 5, "x"); - -TargetPointer -~~~~~~~~~~~~~ - -While null pointer arithmetic or integer-to-pointer conversion is -banned in constexpr, some expressions on target offsets must be folded, -replicating the behaviour of the ``offsetof`` builtin. Target pointers -are characterised by 3 offsets: a field offset, an array offset and a -base offset, along with a descriptor specifying the type the pointer is -supposed to refer to. Array indexing adjusts the array offset, while the -field offset is adjusted when a pointer to a member is created. Casting -an integer to a pointer sets the value of the base offset. As a special -case, null pointers are target pointers with all offsets set to 0. - TypeInfoPointer ~~~~~~~~~~~~~~~ ``TypeInfoPointer`` tracks two types: the type assigned to ``std::type_info`` and the type which was passed to ``typeinfo``. - -InvalidPointer -~~~~~~~~~~~~~~ - -Such pointers are built by operations which cannot generate valid -pointers, allowing the interpreter to continue execution after emitting -a warning. Inspecting such a pointer stops execution. - -TODO -==== - -Missing Language Features -------------------------- - -* Changing the active field of unions -* ``volatile`` -* ``__builtin_constant_p`` -* ``dynamic_cast`` -* ``new`` and ``delete`` -* Fixed Point numbers and arithmetic on Complex numbers -* Several builtin methods, including string operations and - ``__builtin_bit_cast`` -* Continue-after-failure: a form of exception handling at the bytecode - level should be implemented to allow execution to resume. As an example, - argument evaluation should resume after the computation of an argument fails. -* Pointer-to-Integer conversions -* Lazy descriptors: the interpreter creates a ``Record`` and ``Descriptor`` - when it encounters a type: ones which are not yet defined should be lazily - created when required - -Known Bugs ----------- - -* If execution fails, memory storing APInts and APFloats is leaked when the - stack is cleared +It is part of the taged union in ``Pointer``. 
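As a concrete illustration of the BlockPointer model described above (a sketch, not part of the documentation change; the flag `-fexperimental-new-constant-interpreter` enables the bytecode interpreter):

```cpp
// 'p' is a BlockPointer: its base is the interpreter-managed block
// backing 'arr', and its offset designates element 1.
constexpr int arr[3] = {1, 2, 3};
constexpr const int *p = arr + 1;  // array-to-pointer decay, then offset
static_assert(*p == 2, "dereferences through the pointee block");
```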
diff --git a/clang/www/OpenProjects.html b/clang/www/OpenProjects.html index d48b3bebe7611..a9efdb8d762d7 100755 --- a/clang/www/OpenProjects.html +++ b/clang/www/OpenProjects.html @@ -90,17 +90,6 @@

Open Clang Projects
performance as well as to find ways to proactively alert us when we've introduced a change that has significant negative impact on build times.
-  • Complete support for the experimental constant expression interpreter: Clang's production constant expression interpreter computes a constant expression result by walking over AST nodes, performing calculations as it goes. This does not have good performance properties, and so we've begun work on an experimental constant expression interpreter that works by converting the AST into bytecode that is interpreted. This effort has a long tail of work left to complete because it requires implementing byte code for every kind of expression and type that can be used in a constant expression for C++ and C.
-
  • Improve clang-doc: Clang's library-based design allows it to be used by a variety of tools that reason about source code. clang-doc is one From 825e712959d48f14b47e579871bcf9b5e25fff7a Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Fri, 24 Jan 2025 10:24:09 -0800 Subject: [PATCH 038/432] [HLSL] cbuffer: create host layout structs (#122820) Creates layout struct for `cbuffer` in Sema which will contains only declarations contributing to the constant buffer layout. Anything else will be filtered out, such as static variables decls, struct and function definitions, resources, or empty struct and zero-sized arrays. If the constant buffer includes a struct that contains any of the above undesirable declarations, a new version of this struct should be created with these declarations filtered out as well. The definition of buffer layout struct will be added to the HLSLBufferDecl AST node as the last node. Any layout structs for embedded structures will be added there as well. Fixes #122553 --- clang/lib/Sema/SemaHLSL.cpp | 239 ++++++++++++++++++ ... => ast-dump-comment-cbuffer-tbuffer.hlsl} | 39 +-- clang/test/AST/HLSL/cbuffer.hlsl | 209 +++++++++++++++ .../test/AST/HLSL/cbuffer_and_namespaces.hlsl | 98 +++++++ clang/test/AST/HLSL/cbuffer_tbuffer.hlsl | 26 -- clang/test/AST/HLSL/pch_hlsl_buffer.hlsl | 39 +-- 6 files changed, 592 insertions(+), 58 deletions(-) rename clang/test/AST/HLSL/{ast-dump-comment-cbuffe-tbufferr.hlsl => ast-dump-comment-cbuffer-tbuffer.hlsl} (50%) create mode 100644 clang/test/AST/HLSL/cbuffer.hlsl create mode 100644 clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl delete mode 100644 clang/test/AST/HLSL/cbuffer_tbuffer.hlsl diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 5001883003ee2..f26469e6a2f1d 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -15,12 +15,14 @@ #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" #include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclarationName.h" #include "clang/AST/DynamicRecursiveASTVisitor.h" #include "clang/AST/Expr.h" #include "clang/AST/Type.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticSema.h" +#include "clang/Basic/IdentifierTable.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/TargetInfo.h" @@ -32,16 +34,21 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/Casting.h" #include "llvm/Support/DXILABI.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TargetParser/Triple.h" +#include #include #include using namespace clang; using RegisterType = HLSLResourceBindingAttr::RegisterType; +static CXXRecordDecl *createHostLayoutStruct(Sema &S, + CXXRecordDecl *StructDecl); + static RegisterType getRegisterType(ResourceClass RC) { switch (RC) { case ResourceClass::SRV: @@ -253,12 +260,244 @@ static void validatePackoffset(Sema &S, HLSLBufferDecl *BufDecl) { } } +// Returns true if the array has a zero size = if any of the dimensions is 0 +static bool isZeroSizedArray(const ConstantArrayType *CAT) { + while (CAT && !CAT->isZeroSize()) + CAT = dyn_cast( + CAT->getElementType()->getUnqualifiedDesugaredType()); + return CAT != nullptr; +} + +// Returns true if the record type is an HLSL resource class +static bool isResourceRecordType(const Type *Ty) { + return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr; +} + +// Returns true if the 
type is a leaf element type that is not valid to be +// included in HLSL Buffer, such as a resource class, empty struct, zero-sized +// array, or a builtin intangible type. Returns false it is a valid leaf element +// type or if it is a record type that needs to be inspected further. +static bool isInvalidConstantBufferLeafElementType(const Type *Ty) { + if (Ty->isRecordType()) { + if (isResourceRecordType(Ty) || Ty->getAsCXXRecordDecl()->isEmpty()) + return true; + return false; + } + if (Ty->isConstantArrayType() && + isZeroSizedArray(cast(Ty))) + return true; + if (Ty->isHLSLBuiltinIntangibleType()) + return true; + return false; +} + +// Returns true if the struct contains at least one element that prevents it +// from being included inside HLSL Buffer as is, such as an intangible type, +// empty struct, or zero-sized array. If it does, a new implicit layout struct +// needs to be created for HLSL Buffer use that will exclude these unwanted +// declarations (see createHostLayoutStruct function). +static bool requiresImplicitBufferLayoutStructure(const CXXRecordDecl *RD) { + if (RD->getTypeForDecl()->isHLSLIntangibleType() || RD->isEmpty()) + return true; + // check fields + for (const FieldDecl *Field : RD->fields()) { + QualType Ty = Field->getType(); + if (isInvalidConstantBufferLeafElementType(Ty.getTypePtr())) + return true; + if (Ty->isRecordType() && + requiresImplicitBufferLayoutStructure(Ty->getAsCXXRecordDecl())) + return true; + } + // check bases + for (const CXXBaseSpecifier &Base : RD->bases()) + if (requiresImplicitBufferLayoutStructure( + Base.getType()->getAsCXXRecordDecl())) + return true; + return false; +} + +static CXXRecordDecl *findRecordDeclInContext(IdentifierInfo *II, + DeclContext *DC) { + CXXRecordDecl *RD = nullptr; + for (NamedDecl *Decl : + DC->getNonTransparentContext()->lookup(DeclarationName(II))) { + if (CXXRecordDecl *FoundRD = dyn_cast(Decl)) { + assert(RD == nullptr && + "there should be at most 1 record by a given name in a scope"); + RD = FoundRD; + } + } + return RD; +} + +// Creates a name for buffer layout struct using the provide name base. +// If the name must be unique (not previously defined), a suffix is added +// until a unique name is found. +static IdentifierInfo *getHostLayoutStructName(Sema &S, NamedDecl *BaseDecl, + bool MustBeUnique) { + ASTContext &AST = S.getASTContext(); + + IdentifierInfo *NameBaseII = BaseDecl->getIdentifier(); + llvm::SmallString<64> Name("__layout_"); + if (NameBaseII) { + Name.append(NameBaseII->getName()); + } else { + // anonymous struct + Name.append("anon"); + MustBeUnique = true; + } + + size_t NameLength = Name.size(); + IdentifierInfo *II = &AST.Idents.get(Name, tok::TokenKind::identifier); + if (!MustBeUnique) + return II; + + unsigned suffix = 0; + while (true) { + if (suffix != 0) { + Name.append("_"); + Name.append(llvm::Twine(suffix).str()); + II = &AST.Idents.get(Name, tok::TokenKind::identifier); + } + if (!findRecordDeclInContext(II, BaseDecl->getDeclContext())) + return II; + // declaration with that name already exists - increment suffix and try + // again until unique name is found + suffix++; + Name.truncate(NameLength); + }; +} + +// Creates a field declaration of given name and type for HLSL buffer layout +// struct. Returns nullptr if the type cannot be use in HLSL Buffer layout. 
+static FieldDecl *createFieldForHostLayoutStruct(Sema &S, const Type *Ty, + IdentifierInfo *II, + CXXRecordDecl *LayoutStruct) { + if (isInvalidConstantBufferLeafElementType(Ty)) + return nullptr; + + if (Ty->isRecordType()) { + CXXRecordDecl *RD = Ty->getAsCXXRecordDecl(); + if (requiresImplicitBufferLayoutStructure(RD)) { + RD = createHostLayoutStruct(S, RD); + if (!RD) + return nullptr; + Ty = RD->getTypeForDecl(); + } + } + + QualType QT = QualType(Ty, 0); + ASTContext &AST = S.getASTContext(); + TypeSourceInfo *TSI = AST.getTrivialTypeSourceInfo(QT, SourceLocation()); + auto *Field = FieldDecl::Create(AST, LayoutStruct, SourceLocation(), + SourceLocation(), II, QT, TSI, nullptr, false, + InClassInitStyle::ICIS_NoInit); + Field->setAccess(AccessSpecifier::AS_private); + return Field; +} + +// Creates host layout struct for a struct included in HLSL Buffer. +// The layout struct will include only fields that are allowed in HLSL buffer. +// These fields will be filtered out: +// - resource classes +// - empty structs +// - zero-sized arrays +// Returns nullptr if the resulting layout struct would be empty. +static CXXRecordDecl *createHostLayoutStruct(Sema &S, + CXXRecordDecl *StructDecl) { + assert(requiresImplicitBufferLayoutStructure(StructDecl) && + "struct is already HLSL buffer compatible"); + + ASTContext &AST = S.getASTContext(); + DeclContext *DC = StructDecl->getDeclContext(); + IdentifierInfo *II = getHostLayoutStructName(S, StructDecl, false); + + // reuse existing if the layout struct if it already exists + if (CXXRecordDecl *RD = findRecordDeclInContext(II, DC)) + return RD; + + CXXRecordDecl *LS = CXXRecordDecl::Create( + AST, TagDecl::TagKind::Class, DC, SourceLocation(), SourceLocation(), II); + LS->setImplicit(true); + LS->startDefinition(); + + // copy base struct, create HLSL Buffer compatible version if needed + if (unsigned NumBases = StructDecl->getNumBases()) { + assert(NumBases == 1 && "HLSL supports only one base type"); + CXXBaseSpecifier Base = *StructDecl->bases_begin(); + CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl(); + if (requiresImplicitBufferLayoutStructure(BaseDecl)) { + BaseDecl = createHostLayoutStruct(S, BaseDecl); + if (BaseDecl) { + TypeSourceInfo *TSI = AST.getTrivialTypeSourceInfo( + QualType(BaseDecl->getTypeForDecl(), 0)); + Base = CXXBaseSpecifier(SourceRange(), false, StructDecl->isClass(), + AS_none, TSI, SourceLocation()); + } + } + if (BaseDecl) { + const CXXBaseSpecifier *BasesArray[1] = {&Base}; + LS->setBases(BasesArray, 1); + } + } + + // filter struct fields + for (const FieldDecl *FD : StructDecl->fields()) { + const Type *Ty = FD->getType()->getUnqualifiedDesugaredType(); + if (FieldDecl *NewFD = + createFieldForHostLayoutStruct(S, Ty, FD->getIdentifier(), LS)) + LS->addDecl(NewFD); + } + LS->completeDefinition(); + + if (LS->field_empty() && LS->getNumBases() == 0) + return nullptr; + + DC->addDecl(LS); + return LS; +} + +// Creates host layout struct for HLSL Buffer. The struct will include only +// fields of types that are allowed in HLSL buffer and it will filter out: +// - static variable declarations +// - resource classes +// - empty structs +// - zero-sized arrays +// - non-variable declarations +// The layour struct will be added to the HLSLBufferDecl declarations. 
+void createHostLayoutStructForBuffer(Sema &S, HLSLBufferDecl *BufDecl) { + ASTContext &AST = S.getASTContext(); + IdentifierInfo *II = getHostLayoutStructName(S, BufDecl, true); + + CXXRecordDecl *LS = + CXXRecordDecl::Create(AST, TagDecl::TagKind::Class, BufDecl, + SourceLocation(), SourceLocation(), II); + LS->setImplicit(true); + LS->startDefinition(); + + for (const Decl *D : BufDecl->decls()) { + const VarDecl *VD = dyn_cast(D); + if (!VD || VD->getStorageClass() == SC_Static) + continue; + const Type *Ty = VD->getType()->getUnqualifiedDesugaredType(); + if (FieldDecl *FD = + createFieldForHostLayoutStruct(S, Ty, VD->getIdentifier(), LS)) + LS->addDecl(FD); + } + LS->completeDefinition(); + BufDecl->addDecl(LS); +} + +// Handle end of cbuffer/tbuffer declaration void SemaHLSL::ActOnFinishBuffer(Decl *Dcl, SourceLocation RBrace) { auto *BufDecl = cast(Dcl); BufDecl->setRBraceLoc(RBrace); validatePackoffset(SemaRef, BufDecl); + // create buffer layout struct + createHostLayoutStructForBuffer(SemaRef, BufDecl); + SemaRef.PopDeclContext(); } diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl similarity index 50% rename from clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl rename to clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl index e6a2ea7c6d2dc..0bff3ae144037 100644 --- a/clang/test/AST/HLSL/ast-dump-comment-cbuffe-tbufferr.hlsl +++ b/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl @@ -37,19 +37,26 @@ tbuffer B { int d; } -// AST:HLSLBufferDecl {{.*}}:11:1, line:20:1> line:11:9 cbuffer A -// AST-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit CBuffer -// AST-NEXT:-HLSLResourceAttr {{.*}} <> Implicit CBuffer -// AST-NEXT:FullComment {{.*}} -// AST-NEXT:`-ParagraphComment {{.*}} -// AST-NEXT:`-TextComment {{.*}} Text=" CBuffer decl." -// AST-NEXT:-VarDecl {{.*}} col:11 a 'float' -// AST-NEXT:`-VarDecl {{.*}} col:9 b 'int' -// AST-NEXT:HLSLBufferDecl {{.*}} line:29:9 tbuffer B -// AST-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit SRV -// AST-NEXT:-HLSLResourceAttr {{.*}} <> Implicit TBuffer -// AST-NEXT:-FullComment {{.*}} -// AST-NEXT: `-ParagraphComment {{.*}} -// AST-NEXT: `-TextComment {{.*}} Text=" TBuffer decl." -// AST-NEXT:-VarDecl {{.*}} col:11 c 'float' -// AST-NEXT:`-VarDecl {{.*}} col:9 d 'int' +// AST: HLSLBufferDecl {{.*}} line:11:9 cbuffer A +// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// AST-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer +// AST-NEXT: FullComment +// AST-NEXT: ParagraphComment +// AST-NEXT: TextComment {{.*}} Text=" CBuffer decl." +// AST-NEXT: VarDecl {{.*}} a 'float' +// AST-NEXT: VarDecl {{.*}} b 'int' +// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_A definition +// AST: FieldDecl {{.*}} a 'float' +// AST-NEXT: FieldDecl {{.*}} b 'int' + +// AST-NEXT: HLSLBufferDecl {{.*}} line:29:9 tbuffer B +// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV +// AST-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer +// AST-NEXT: FullComment +// AST-NEXT: ParagraphComment +// AST-NEXT: TextComment {{.*}} Text=" TBuffer decl." 
+// AST-NEXT: VarDecl {{.*}} c 'float' +// AST-NEXT: VarDecl {{.*}} d 'int' +// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_B definition +// AST: FieldDecl {{.*}} c 'float' +// AST-NEXT: FieldDecl {{.*}} d 'int' diff --git a/clang/test/AST/HLSL/cbuffer.hlsl b/clang/test/AST/HLSL/cbuffer.hlsl new file mode 100644 index 0000000000000..721abb290f163 --- /dev/null +++ b/clang/test/AST/HLSL/cbuffer.hlsl @@ -0,0 +1,209 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s + +struct EmptyStruct { +}; + +struct A { + float a; +}; + +struct B { + RWBuffer buf; + EmptyStruct es; + float ea[0]; + float a; +}; + +struct C { + EmptyStruct es; +}; + +typedef B BTypedef; +typedef C CTypedef; + +struct D : B { + float b; +}; + +struct E : EmptyStruct { + float c; +}; + +struct F : A { + int ae[0]; +}; + +typedef float EmptyArrayTypedef[10][0]; + +struct OneFloat { + float a; +}; + +struct TwoFloats { + float a; + float b; +}; + +// CHECK: HLSLBufferDecl {{.*}} line:50:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} col:9 used a1 'float' + float a1; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB definition + // CHECK: FieldDecl {{.*}} a1 'float' +} +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_CB), ""); + +// Check that buffer layout struct does not include resources or empty types +// CHECK: HLSLBufferDecl {{.*}} line:62:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} col:9 used a2 'float' + float a2; + // CHECK: VarDecl {{.*}} col:19 b2 'RWBuffer':'hlsl::RWBuffer' + RWBuffer b2; + // CHECK: VarDecl {{.*}} col:15 c2 'EmptyStruct' + EmptyStruct c2; + // CHECK: VarDecl {{.*}} col:9 d2 'float[0]' + float d2[0]; + // CHECK: VarDecl {{.*}} col:9 e2 'float' + float e2; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_1 definition + // CHECK: FieldDecl {{.*}} a2 'float' + // CHECK-NEXT: FieldDecl {{.*}} e2 'float' +} +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_1), ""); + +// Check that layout struct is created for B and the empty struct C is removed +// CHECK: HLSLBufferDecl {{.*}} line:83:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} col:5 used s1 'A' + A s1; + // CHECK: VarDecl {{.*}} col:5 s2 'B' + B s2; + // CHECK: VarDecl {{.*}} col:12 s3 'CTypedef':'C + CTypedef s3; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_2 definition + // CHECK: FieldDecl {{.*}} s1 'A' + // CHECK: FieldDecl {{.*}} s2 '__layout_B' +} +// CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_B definition +// CHECK: FieldDecl {{.*}} a 'float' + +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_B), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_2), ""); + +// check that layout struct is created for D because of its base struct +// CHECK: HLSLBufferDecl {{.*}} line:104:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} s4 'D' + D s4; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_3 
definition + // CHECK: FieldDecl {{.*}} s4 '__layout_D' +} + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_D definition + // CHECK: public '__layout_B' + // CHECK: FieldDecl {{.*}} b 'float' +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_D), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_3), ""); + +// check that layout struct is created for E because because its base struct +// is empty and should be eliminated, and BTypedef should reuse the previously +// defined '__layout_B' +// CHECK: HLSLBufferDecl {{.*}} line:122:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} s5 'E' + E s5; + // CHECK: VarDecl {{.*}} s6 'BTypedef':'B' + BTypedef s6; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_4 definition + // CHECK: FieldDecl {{.*}} s5 '__layout_E' + // CHECK: FieldDecl {{.*}} s6 '__layout_B' +} + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_E definition + // CHECK: FieldDecl {{.*}} c 'float' + // CHECK-NOT: CXXRecordDecl {{.*}} class __layout_B definition +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_E), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_4), ""); + +// check that this produces empty layout struct +// CHECK: HLSLBufferDecl {{.*}} line:141:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: FunctionDecl {{.*}} f 'void ()' + void f() {} + // CHECK: VarDecl {{.*}} SV 'float' static + static float SV; + // CHECK: VarDecl {{.*}} s7 'EmptyStruct' callinit + EmptyStruct s7; + // CHECK: VarDecl {{.*}} Buf 'RWBuffer':'hlsl::RWBuffer' callinit + RWBuffer Buf; + // CHECK: VarDecl {{.*}} ea 'EmptyArrayTypedef':'float[10][0]' + EmptyArrayTypedef ea; + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_CB_5 definition + // CHECK-NOT: FieldDecl +} + +// check host layout struct with compatible base struct +// CHECK: HLSLBufferDecl {{.*}} line:160:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: VarDecl {{.*}} s8 'F' + F s8; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_6 definition + // CHECK: FieldDecl {{.*}} s8 '__layout_F' +} + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_F definition + // CHECK: public 'A' +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_F), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_CB_6), ""); + +// anonymous structs +// CHECK: HLSLBufferDecl {{.*}} line:175:9 cbuffer CB +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB { + // CHECK: CXXRecordDecl {{.*}} struct definition + struct { + // CHECK: FieldDecl {{.*}} e 'float' + float e; + // CHECK: FieldDecl {{.*}} c 'int[0][1]' + int c[0][1]; + // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' + RWBuffer f; + } s9; + // CHECK: VarDecl {{.*}} s9 'struct (unnamed struct at {{.*}}cbuffer.hlsl:177:3 + // CHECK: CXXRecordDecl {{.*}} struct definition + struct { + // CHECK: FieldDecl {{.*}} g 'float' + float g; + // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' + RWBuffer f; + } s10; + // CHECK: 
VarDecl {{.*}} s10 'struct (unnamed struct at {{.*}}cbuffer.hlsl:187:3 + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon definition + // CHECK: FieldDecl {{.*}} e 'float' + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon_1 definition + // CHECK: FieldDecl {{.*}} g 'float' + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_7 definition + // CHECK: FieldDecl {{.*}} s9 '__layout_anon' + // CHECK: FieldDecl {{.*}} s10 '__layout_anon_1' +} +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_anon), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout_anon_1), ""); +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layout_CB_7), ""); + +// Add uses for the constant buffer declarations so they are not optimized away +export float foo() { + return a1 + a2 + s1.a + s4.b + s5.c + s8.a + s9.e; +} diff --git a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl new file mode 100644 index 0000000000000..4b1bbea736f85 --- /dev/null +++ b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl @@ -0,0 +1,98 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl {{.*}} struct EmptyStruct definition +struct EmptyStruct { +}; + +// CHECK: NamespaceDecl {{.*}} NS1 +namespace NS1 { + // CHECK: CXXRecordDecl {{.*}} struct Foo definition + struct Foo { + float a; + EmptyStruct es; + }; + + // CHECK: CXXRecordDecl {{.*}} struct Bar definition + struct Bar { + // CHECK: CXXRecordDecl {{.*}} struct Foo definition + struct Foo { + int b; + EmptyStruct es; + }; + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition + // CHECK: FieldDecl {{.*}} b 'int' + }; + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition + // CHECK: FieldDecl {{.*}} a 'float' +} + +struct Foo { + double c; + EmptyStruct es; +}; + +// CHECK: HLSLBufferDecl {{.*}} line:37:9 cbuffer CB1 +// CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer +cbuffer CB1 { + // CHECK: VarDecl {{.*}} foo1 'Foo' + Foo foo1; + // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + NS1::Foo foo2; + // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + NS1::Bar::Foo foo3; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB1 definition + // CHECK: FieldDecl {{.*}} foo1 '__layout_Foo' + // CHECK: FieldDecl {{.*}} foo2 'NS1::__layout_Foo' + // CHECK: FieldDecl {{.*}} foo3 'NS1::Bar::__layout_Foo' +} +// CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition +// CHECK: FieldDecl {{.*}} c 'double' + +struct CB1ExpectedShape { + double a1; + float a2; + int a; +}; +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(CB1ExpectedShape, __layout_CB1), ""); + +namespace NS2 { + struct Foo { + float d[4]; + EmptyStruct es; + }; + // CHECK: HLSLBufferDecl {{.*}} line:67:11 cbuffer CB2 + // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer + // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer + cbuffer CB2 { + // CHECK: VarDecl {{.*}} foo0 '::Foo':'Foo' + ::Foo foo0; + // CHECK: VarDecl {{.*}} foo1 'Foo':'NS2::Foo' + Foo foo1; + // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + NS1::Foo foo2; + // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + NS1::Bar::Foo foo3; + // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB2 definition + // CHECK: FieldDecl {{.*}} foo0 '__layout_Foo' + // CHECK: FieldDecl {{.*}} foo1 
'NS2::__layout_Foo' + // CHECK: FieldDecl {{.*}} foo2 'NS1::__layout_Foo' + // CHECK: FieldDecl {{.*}} foo3 'NS1::Bar::__layout_Foo' + } + // CHECK: CXXRecordDecl {{.*}} implicit class __layout_Foo definition + // CHECK: FieldDecl {{.*}} d 'float[4]' +} + +struct CB2ExpectedShape { + double a1; + float d[4]; + float a2; + int a; +}; +_Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(CB2ExpectedShape, NS2::__layout_CB2), ""); + +// Add uses for the constant buffer declarations so they are not optimized away +// CHECK: ExportDecl +export float f() { + return foo2.a + NS2::foo2.a; +} diff --git a/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl b/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl deleted file mode 100644 index 5e558354cd3a0..0000000000000 --- a/clang/test/AST/HLSL/cbuffer_tbuffer.hlsl +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK:HLSLBufferDecl 0x[[CB:[0-9a-f]+]] {{.*}} line:7:9 cbuffer CB -// CHECK:HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit CBuffer -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit CBuffer -// CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'float' -cbuffer CB { - float a; -} - -// CHECK:HLSLBufferDecl 0x[[TB:[0-9a-f]+]] {{.*}} line:15:9 tbuffer TB -// CHECK:HLSLResourceClassAttr 0x{{[0-9a-f]+}} <> Implicit SRV -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit TBuffer -// CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'float' -tbuffer TB { - float b; -} - -float foo() { -// CHECK: BinaryOperator 0x{{[0-9a-f]+}} 'float' '+' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[A]] 'a' 'float' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[B]] 'b' 'float' - return a + b; -} diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl index 281d8be8addf0..3eabbb1f8ae22 100644 --- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl +++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl @@ -15,20 +15,27 @@ tbuffer B { float foo() { return a + b; } + // Make sure cbuffer/tbuffer works for PCH. 
-// CHECK:HLSLBufferDecl 0x{{[0-9a-f]+}} <{{.*}}:7:1, line:9:1> line:7:9 imported cbuffer A -// CHECK-NEXT:HLSLResourceClassAttr {{.*}} <> Implicit CBuffer -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit CBuffer -// CHECK-NEXT:`-VarDecl 0x[[A:[0-9a-f]+]] col:9 imported used a 'float' -// CHECK-NEXT:HLSLBufferDecl 0x{{[0-9a-f]+}} line:11:9 imported tbuffer B -// CHECK-NEXT:HLSLResourceClassAttr {{.*}} <> Implicit SRV -// CHECK-NEXT:HLSLResourceAttr {{.*}} <> Implicit TBuffer -// CHECK-NEXT:`-VarDecl 0x[[B:[0-9a-f]+]] col:9 imported used b 'float' -// CHECK-NEXT:FunctionDecl 0x{{[0-9a-f]+}} line:15:7 imported foo 'float ()' -// CHECK-NEXT:CompoundStmt 0x{{[0-9a-f]+}} -// CHECK-NEXT:ReturnStmt 0x{{[0-9a-f]+}} -// CHECK-NEXT:BinaryOperator 0x{{[0-9a-f]+}} 'float' '+' -// CHECK-NEXT:ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT:`-DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[A]] 'a' 'float' -// CHECK-NEXT:`-ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT:`-DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[B]] 'b' 'float' +// CHECK: HLSLBufferDecl {{.*}} line:7:9 imported cbuffer A +// CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// CHECK-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer +// CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'float' +// CHECK-NEXT: CXXRecordDecl {{.*}} imported implicit class __layout_A definition +// CHECK: FieldDecl {{.*}} imported a 'float' + +// CHECK: HLSLBufferDecl {{.*}} line:11:9 imported tbuffer B +// CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV +// CHECK-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer +// CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'float' +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} imported implicit class __layout_B definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} {{.*}} imported b 'float' + +// CHECK-NEXT: FunctionDecl {{.*}} line:15:7 imported foo 'float ()' +// CHECK-NEXT: CompoundStmt {{.*}} +// CHECK-NEXT: ReturnStmt {{.*}} +// CHECK-NEXT: BinaryOperator {{.*}} 'float' '+' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[A]] 'a' 'float' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[B]] 'b' 'float' From 3861b9db882d5637725ceeccb801c2bb837e8fc5 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 24 Jan 2025 18:27:23 +0000 Subject: [PATCH 039/432] [gn build] Port 0cd794d4860e --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 639095b698c6f..90303821eb09f 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -347,6 +347,7 @@ if (current_toolchain == default_toolchain) { "__chrono/time_zone_link.h", "__chrono/tzdb.h", "__chrono/tzdb_list.h", + "__chrono/utc_clock.h", "__chrono/weekday.h", "__chrono/year.h", "__chrono/year_month.h", From ab976a17121374ae3407374b2aa6306e95863eb3 Mon Sep 17 00:00:00 2001 From: Stephen Long <63318318+steplong@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:02:06 -0500 Subject: [PATCH 040/432] PreISelIntrinsicLowering: Lower llvm.exp/llvm.exp2 to a loop if scalable vec arg (#117568) --- llvm/include/llvm/CodeGen/TargetLowering.h | 4 + .../Transforms/Utils/LowerVectorIntrinsics.h | 30 ++++++++ llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp | 14 ++++ llvm/lib/CodeGen/TargetLoweringBase.cpp 
| 11 +++ llvm/lib/Transforms/Utils/CMakeLists.txt | 1 + .../Utils/LowerVectorIntrinsics.cpp | 73 +++++++++++++++++++ .../AArch64/expand-exp.ll | 43 +++++++++++ .../AArch64/lit.local.cfg | 2 + .../llvm/lib/Transforms/Utils/BUILD.gn | 1 + 9 files changed, 179 insertions(+) create mode 100644 llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h create mode 100644 llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll create mode 100644 llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 59743dbe4d2ea..861cffdc115a4 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2123,6 +2123,10 @@ class TargetLoweringBase { /// Get the ISD node that corresponds to the Instruction class opcode. int InstructionOpcodeToISD(unsigned Opcode) const; + /// Get the ISD node that corresponds to the Intrinsic ID. Returns + /// ISD::DELETED_NODE by default for an unsupported Intrinsic ID. + int IntrinsicIDToISD(Intrinsic::ID ID) const; + /// @} //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h new file mode 100644 index 0000000000000..cb48bb01e178a --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/LowerVectorIntrinsics.h @@ -0,0 +1,30 @@ +//===- llvm/Transforms/Utils/LowerVectorIntrinsics.h ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lower intrinsics with a scalable vector arg to loops. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H +#define LLVM_TRANSFORMS_UTILS_LOWERVECTORINTRINSICS_H + +#include +#include + +namespace llvm { + +class CallInst; +class Module; + +/// Lower \p CI as a loop. \p CI is a unary intrinsic with a vector argument and +/// is deleted and replaced with a loop. 
+bool lowerUnaryVectorIntrinsicAsLoop(Module &M, CallInst *CI); + +} // namespace llvm + +#endif diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp index 4a3d1673c2a7c..048a6a49e4cb9 100644 --- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp @@ -33,6 +33,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h" #include "llvm/Transforms/Utils/LowerMemIntrinsics.h" +#include "llvm/Transforms/Utils/LowerVectorIntrinsics.h" using namespace llvm; @@ -453,6 +454,19 @@ bool PreISelIntrinsicLowering::lowerIntrinsics(Module &M) const { case Intrinsic::objc_sync_exit: Changed |= lowerObjCCall(F, "objc_sync_exit"); break; + case Intrinsic::exp: + case Intrinsic::exp2: + Changed |= forEachCall(F, [&](CallInst *CI) { + Type *Ty = CI->getArgOperand(0)->getType(); + if (!isa(Ty)) + return false; + const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering(); + unsigned Op = TL->IntrinsicIDToISD(F.getIntrinsicID()); + if (!TL->isOperationExpand(Op, EVT::getEVT(Ty))) + return false; + return lowerUnaryVectorIntrinsicAsLoop(M, CI); + }); + break; } } return Changed; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 73af0a9a71407..9c56912aa6ba0 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1841,6 +1841,17 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { llvm_unreachable("Unknown instruction type encountered!"); } +int TargetLoweringBase::IntrinsicIDToISD(Intrinsic::ID ID) const { + switch (ID) { + case Intrinsic::exp: + return ISD::FEXP; + case Intrinsic::exp2: + return ISD::FEXP2; + default: + return ISD::DELETED_NODE; + } +} + Value * TargetLoweringBase::getDefaultSafeStackPointerLocation(IRBuilderBase &IRB, bool UseTLS) const { diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index 65bd3080662c4..78cad0d253be8 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -56,6 +56,7 @@ add_llvm_component_library(LLVMTransformUtils LowerInvoke.cpp LowerMemIntrinsics.cpp LowerSwitch.cpp + LowerVectorIntrinsics.cpp MatrixUtils.cpp MemoryOpRemark.cpp MemoryTaggingSupport.cpp diff --git a/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp new file mode 100644 index 0000000000000..cd716deec14f5 --- /dev/null +++ b/llvm/lib/Transforms/Utils/LowerVectorIntrinsics.cpp @@ -0,0 +1,73 @@ +//===- LowerVectorIntrinsics.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/LowerVectorIntrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "lower-vector-intrinsics" + +using namespace llvm; + +bool llvm::lowerUnaryVectorIntrinsicAsLoop(Module &M, CallInst *CI) { + Type *ArgTy = CI->getArgOperand(0)->getType(); + VectorType *VecTy = cast(ArgTy); + + BasicBlock *PreLoopBB = CI->getParent(); + BasicBlock *PostLoopBB = nullptr; + Function *ParentFunc = PreLoopBB->getParent(); + LLVMContext &Ctx = PreLoopBB->getContext(); + + PostLoopBB = PreLoopBB->splitBasicBlock(CI); + BasicBlock *LoopBB = BasicBlock::Create(Ctx, "", ParentFunc, PostLoopBB); + PreLoopBB->getTerminator()->setSuccessor(0, LoopBB); + + // Loop preheader + IRBuilder<> PreLoopBuilder(PreLoopBB->getTerminator()); + Value *LoopEnd = nullptr; + if (auto *ScalableVecTy = dyn_cast(VecTy)) { + Value *VScale = PreLoopBuilder.CreateVScale( + ConstantInt::get(PreLoopBuilder.getInt64Ty(), 1)); + Value *N = ConstantInt::get(PreLoopBuilder.getInt64Ty(), + ScalableVecTy->getMinNumElements()); + LoopEnd = PreLoopBuilder.CreateMul(VScale, N); + } else { + FixedVectorType *FixedVecTy = cast(VecTy); + LoopEnd = ConstantInt::get(PreLoopBuilder.getInt64Ty(), + FixedVecTy->getNumElements()); + } + + // Loop body + IRBuilder<> LoopBuilder(LoopBB); + Type *Int64Ty = LoopBuilder.getInt64Ty(); + + PHINode *LoopIndex = LoopBuilder.CreatePHI(Int64Ty, 2); + LoopIndex->addIncoming(ConstantInt::get(Int64Ty, 0U), PreLoopBB); + PHINode *Vec = LoopBuilder.CreatePHI(VecTy, 2); + Vec->addIncoming(CI->getArgOperand(0), PreLoopBB); + + Value *Elem = LoopBuilder.CreateExtractElement(Vec, LoopIndex); + Function *Exp = Intrinsic::getOrInsertDeclaration(&M, CI->getIntrinsicID(), + VecTy->getElementType()); + Value *Res = LoopBuilder.CreateCall(Exp, Elem); + Value *NewVec = LoopBuilder.CreateInsertElement(Vec, Res, LoopIndex); + Vec->addIncoming(NewVec, LoopBB); + + Value *One = ConstantInt::get(Int64Ty, 1U); + Value *NextLoopIndex = LoopBuilder.CreateAdd(LoopIndex, One); + LoopIndex->addIncoming(NextLoopIndex, LoopBB); + + Value *ExitCond = + LoopBuilder.CreateICmp(CmpInst::ICMP_EQ, NextLoopIndex, LoopEnd); + LoopBuilder.CreateCondBr(ExitCond, PostLoopBB, LoopBB); + + CI->replaceAllUsesWith(NewVec); + CI->eraseFromParent(); + return true; +} diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll new file mode 100644 index 0000000000000..284f2ad8072fc --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=pre-isel-intrinsic-lowering -S < %s | FileCheck %s +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64" + +define @scalable_vec_exp( %input) { +; CHECK-LABEL: define @scalable_vec_exp( +; CHECK-SAME: [[INPUT:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 +; CHECK-NEXT: br label %[[BB3:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP9:%.*]], %[[BB3]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi [ [[INPUT]], [[TMP0]] ], [ [[TMP8:%.*]], %[[BB3]] ] +; CHECK-NEXT: 
[[TMP6:%.*]] = extractelement [[TMP5]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.exp.f32(float [[TMP6]]) +; CHECK-NEXT: [[TMP8]] = insertelement [[TMP5]], float [[TMP7]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP9]] = add i64 [[TMP4]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], [[TMP2]] +; CHECK-NEXT: br i1 [[TMP10]], label %[[BB11:.*]], label %[[BB3]] +; CHECK: [[BB11]]: +; CHECK-NEXT: ret [[TMP8]] +; + %output = call @llvm.exp.nxv4f32( %input) + ret %output +} + +define <4 x float> @fixed_vec_exp(<4 x float> %input) { +; CHECK-LABEL: define <4 x float> @fixed_vec_exp( +; CHECK-SAME: <4 x float> [[INPUT:%.*]]) { +; CHECK-NEXT: [[OUTPUT:%.*]] = call <4 x float> @llvm.exp.v4f32(<4 x float> [[INPUT]]) +; CHECK-NEXT: ret <4 x float> [[OUTPUT]] +; + %output = call <4 x float> @llvm.exp.v4f32(<4 x float> %input) + ret <4 x float> %output +} + +declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 +declare @llvm.exp.nxv4f32() #0 + +; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-NEXT: attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) } +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg new file mode 100644 index 0000000000000..10d4a0e953ed4 --- /dev/null +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/lit.local.cfg @@ -0,0 +1,2 @@ +if not "AArch64" in config.root.targets: + config.unsupported = True diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn index 1479e1c355d95..b16fe19bddfd1 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Utils/BUILD.gn @@ -64,6 +64,7 @@ static_library("Utils") { "LowerInvoke.cpp", "LowerMemIntrinsics.cpp", "LowerSwitch.cpp", + "LowerVectorIntrinsics.cpp", "MatrixUtils.cpp", "Mem2Reg.cpp", "MemoryOpRemark.cpp", From 83df39c649fe1b1dd556d8f2160999c65ce497eb Mon Sep 17 00:00:00 2001 From: junfengd-nv Date: Fri, 24 Jan 2025 11:06:37 -0800 Subject: [PATCH 041/432] [mlir][inline] Fix Issue#82401: Infinite loop in MLIR inliner for indirect recursive call. (#124026) --- mlir/lib/Transforms/Utils/Inliner.cpp | 4 +- .../test/Transforms/inlining-recursive-2.mlir | 37 +++++++++++++++++++ mlir/test/Transforms/inlining-recursive.mlir | 2 +- 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 mlir/test/Transforms/inlining-recursive-2.mlir diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp index 8acfc96d2b611..756f5e379e7dd 100644 --- a/mlir/lib/Transforms/Utils/Inliner.cpp +++ b/mlir/lib/Transforms/Utils/Inliner.cpp @@ -713,9 +713,11 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) { return false; // Don't allow inlining if the target is a self-recursive function. + // Don't allow inlining if the call graph is like A->B->A. 
if (llvm::count_if(*resolvedCall.targetNode, [&](CallGraphNode::Edge const &edge) -> bool { - return edge.getTarget() == resolvedCall.targetNode; + return edge.getTarget() == resolvedCall.targetNode || + edge.getTarget() == resolvedCall.sourceNode; }) > 0) return false; diff --git a/mlir/test/Transforms/inlining-recursive-2.mlir b/mlir/test/Transforms/inlining-recursive-2.mlir new file mode 100644 index 0000000000000..e50cf9695c4a3 --- /dev/null +++ b/mlir/test/Transforms/inlining-recursive-2.mlir @@ -0,0 +1,37 @@ +// RUN: mlir-opt %s -inline='default-pipeline=' | FileCheck %s +// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline=' | FileCheck %s + +module { + // CHECK-LABEL: func.func @parent1 + func.func @parent1(%arg0: i32) -> i32 { + // CHECK: call @child + %0 = call @child(%arg0) : (i32) -> i32 + return %0 : i32 + } + + // CHECK-LABEL: func.func @parent2 + func.func @parent2(%arg0: i32) -> i32 { + // CHECK: call @child + %0 = call @child(%arg0) : (i32) -> i32 + return %0 : i32 + } + + // CHECK-LABEL: func.func @child + func.func @child(%arg0: i32) -> i32 { + %c10_i32 = arith.constant 10 : i32 + %c1_i32 = arith.constant 1 : i32 + %0 = arith.cmpi sge, %arg0, %c10_i32 : i32 + %1 = scf.if %0 -> (i32) { + scf.yield %arg0 : i32 + } else { + %2 = arith.addi %arg0, %c1_i32 : i32 + // CHECK: call @parent1 + // CHECK: call @parent2 + %3 = func.call @parent1(%2) : (i32) -> i32 + %4 = func.call @parent2(%2) : (i32) -> i32 + %5 = arith.addi %3, %4 : i32 + scf.yield %5 : i32 + } + return %1 : i32 + } +} diff --git a/mlir/test/Transforms/inlining-recursive.mlir b/mlir/test/Transforms/inlining-recursive.mlir index 403accd8b7ee8..f953935475e1a 100644 --- a/mlir/test/Transforms/inlining-recursive.mlir +++ b/mlir/test/Transforms/inlining-recursive.mlir @@ -17,7 +17,7 @@ func.func @foo0(%arg0 : i32) -> i32 { // CHECK-LABEL: func.func @foo1 func.func @foo1(%arg0 : i32) -> i32 { - // CHECK: call @foo1 + // CHECK: call @foo0 %0 = arith.constant 1 : i32 %1 = arith.subi %arg0, %0 : i32 %2 = call @foo0(%1) : (i32) -> i32 From 3b30f20c60d020e43f5700dae68cf1080158b725 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Fri, 24 Jan 2025 20:23:18 +0100 Subject: [PATCH 042/432] [libc++][TZDB] Fixes CI. The commit 24e70e3930724ce499ad05d669bfbc4423c542e0 changed internal macros which were used in 0cd794d4860e376698bb4da24bcdf8cbf331835c. 
This caused build failures on platforms without TZDB support --- libcxx/include/__chrono/convert_to_tm.h | 8 ++++---- libcxx/include/__chrono/formatter.h | 4 ++-- libcxx/include/__chrono/ostream.h | 4 ++-- libcxx/include/__chrono/utc_clock.h | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__chrono/convert_to_tm.h b/libcxx/include/__chrono/convert_to_tm.h index e547e107a5852..7d06a38d87f26 100644 --- a/libcxx/include/__chrono/convert_to_tm.h +++ b/libcxx/include/__chrono/convert_to_tm.h @@ -100,7 +100,7 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const chrono::sys_time<_Duration> __tp } # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB template _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(chrono::utc_time<_Duration> __tp) { @@ -112,7 +112,7 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(chrono::utc_time<_Duration> __tp) { return __result; } -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION // Convert a chrono (calendar) time point, or dururation to the given _Tm type, @@ -128,10 +128,10 @@ _LIBCPP_HIDE_FROM_ABI _Tm __convert_to_tm(const _ChronoT& __value) { if constexpr (same_as) return std::__convert_to_tm<_Tm>(__value); # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB else if constexpr (same_as) return std::__convert_to_tm<_Tm>(__value); -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM && _LIBCPP_HAS_LOCALIZATION else if constexpr (same_as) return std::__convert_to_tm<_Tm>(_ChronoT::clock::to_sys(__value)); diff --git a/libcxx/include/__chrono/formatter.h b/libcxx/include/__chrono/formatter.h index 6153fdc35a47b..d17acd274e4cd 100644 --- a/libcxx/include/__chrono/formatter.h +++ b/libcxx/include/__chrono/formatter.h @@ -721,7 +721,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : pub }; # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB template struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : public __formatter_chrono<_CharT> { @@ -734,7 +734,7 @@ struct _LIBCPP_TEMPLATE_VIS formatter, _CharT> : pub } }; -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM template diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h index 66735e5c2c28b..ed9ad8e346ba9 100644 --- a/libcxx/include/__chrono/ostream.h +++ b/libcxx/include/__chrono/ostream.h @@ -63,7 +63,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_days& __dp) { } # if _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM -# if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# if _LIBCPP_HAS_EXPERIMENTAL_TZDB template _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& @@ -71,7 +71,7 @@ operator<<(basic_ostream<_CharT, _Traits>& __os, const utc_time<_Duration>& __tp return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%F %T}"), __tp); } -# endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +# endif // 
_LIBCPP_HAS_EXPERIMENTAL_TZDB # endif // _LIBCPP_HAS_TIME_ZONE_DATABASE && _LIBCPP_HAS_FILESYSTEM template diff --git a/libcxx/include/__chrono/utc_clock.h b/libcxx/include/__chrono/utc_clock.h index 647b6eda13ea2..2207b89c92c59 100644 --- a/libcxx/include/__chrono/utc_clock.h +++ b/libcxx/include/__chrono/utc_clock.h @@ -12,7 +12,7 @@ #include // Enable the contents of the header only when libc++ was built with experimental features enabled. -#if !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +#if _LIBCPP_HAS_EXPERIMENTAL_TZDB # include <__chrono/duration.h> # include <__chrono/leap_second.h> @@ -158,6 +158,6 @@ utc_clock::to_sys(const utc_time<_Duration>& __time) { _LIBCPP_END_NAMESPACE_STD -#endif // !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_TZDB) +#endif // _LIBCPP_HAS_EXPERIMENTAL_TZDB #endif // _LIBCPP___CHRONO_UTC_CLOCK_H From 95d993a838863269dc1b90de3808c1e40ac6d5f2 Mon Sep 17 00:00:00 2001 From: Henrich Lauko Date: Fri, 24 Jan 2025 20:28:36 +0100 Subject: [PATCH 043/432] [MLIR] Fix import of calls with mismatched variadic types (#124286) Previously, an indirect call was incorrectly generated when `llvm::CallBase::getCalledFunction` returned null due to a type mismatch between the call and the function. This patch updates the code to use `llvm::CallBase::getCalledOperand` instead. --- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 77 +++++++++++-------- .../test/Target/LLVMIR/Import/instructions.ll | 25 ++++++ 2 files changed, 70 insertions(+), 32 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index f6826a2362bfd..40d86efe605ad 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1495,15 +1495,22 @@ LogicalResult ModuleImport::convertCallTypeAndOperands( if (!callInst->getType()->isVoidTy()) types.push_back(convertType(callInst->getType())); - if (!callInst->getCalledFunction()) { - if (!allowInlineAsm || - !isa(callInst->getCalledOperand())) { - FailureOr called = convertValue(callInst->getCalledOperand()); - if (failed(called)) - return failure(); - operands.push_back(*called); - } + bool isInlineAsm = callInst->isInlineAsm(); + if (isInlineAsm && !allowInlineAsm) + return failure(); + + // Cannot use isIndirectCall() here because we need to handle Constant callees + // that are not considered indirect calls by LLVM. However, in MLIR, they are + // treated as indirect calls to constant operands that need to be converted. + // Skip the callee operand if it's inline assembly, as it's handled separately + // in InlineAsmOp. 
+ if (!isa(callInst->getCalledOperand()) && !isInlineAsm) { + FailureOr called = convertValue(callInst->getCalledOperand()); + if (failed(called)) + return failure(); + operands.push_back(*called); } + SmallVector args(callInst->args()); FailureOr> arguments = convertValues(args); if (failed(arguments)) @@ -1593,7 +1600,8 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { return success(); } if (inst->getOpcode() == llvm::Instruction::Call) { - auto *callInst = cast(inst); + auto callInst = cast(inst); + llvm::Value *calledOperand = callInst->getCalledOperand(); SmallVector types; SmallVector operands; @@ -1601,15 +1609,12 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { /*allowInlineAsm=*/true))) return failure(); - auto funcTy = - dyn_cast(convertType(callInst->getFunctionType())); - if (!funcTy) - return failure(); - - if (auto asmI = dyn_cast(callInst->getCalledOperand())) { + if (auto asmI = dyn_cast(calledOperand)) { + Type resultTy = convertType(callInst->getType()); + if (!resultTy) + return failure(); auto callOp = builder.create( - loc, funcTy.getReturnType(), operands, - builder.getStringAttr(asmI->getAsmString()), + loc, resultTy, operands, builder.getStringAttr(asmI->getAsmString()), builder.getStringAttr(asmI->getConstraintString()), /*has_side_effects=*/true, /*is_align_stack=*/false, /*asm_dialect=*/nullptr, @@ -1619,27 +1624,35 @@ LogicalResult ModuleImport::convertInstruction(llvm::Instruction *inst) { else mapNoResultOp(inst, callOp); } else { - CallOp callOp; + auto funcTy = dyn_cast([&]() -> Type { + // Retrieve the real function type. For direct calls, use the callee's + // function type, as it may differ from the operand type in the case of + // variadic functions. For indirect calls, use the call function type. + if (auto callee = dyn_cast(calledOperand)) + return convertType(callee->getFunctionType()); + return convertType(callInst->getFunctionType()); + }()); + + if (!funcTy) + return failure(); - if (llvm::Function *callee = callInst->getCalledFunction()) { - callOp = builder.create( - loc, funcTy, SymbolRefAttr::get(context, callee->getName()), - operands); - } else { - callOp = builder.create(loc, funcTy, operands); - } + auto callOp = [&]() -> CallOp { + if (auto callee = dyn_cast(calledOperand)) { + auto name = SymbolRefAttr::get(context, callee->getName()); + return builder.create(loc, funcTy, name, operands); + } + return builder.create(loc, funcTy, operands); + }(); + + // Handle function attributes. callOp.setCConv(convertCConvFromLLVM(callInst->getCallingConv())); callOp.setTailCallKind( convertTailCallKindFromLLVM(callInst->getTailCallKind())); setFastmathFlagsAttr(inst, callOp); - // Handle function attributes. 
- if (callInst->hasFnAttr(llvm::Attribute::Convergent)) - callOp.setConvergent(true); - if (callInst->hasFnAttr(llvm::Attribute::NoUnwind)) - callOp.setNoUnwind(true); - if (callInst->hasFnAttr(llvm::Attribute::WillReturn)) - callOp.setWillReturn(true); + callOp.setConvergent(callInst->isConvergent()); + callOp.setNoUnwind(callInst->doesNotThrow()); + callOp.setWillReturn(callInst->hasFnAttr(llvm::Attribute::WillReturn)); llvm::MemoryEffects memEffects = callInst->getMemoryEffects(); ModRefInfo othermem = convertModRefInfoFromLLVM( diff --git a/mlir/test/Target/LLVMIR/Import/instructions.ll b/mlir/test/Target/LLVMIR/Import/instructions.ll index 7377e2584110b..77052ab6e41f6 100644 --- a/mlir/test/Target/LLVMIR/Import/instructions.ll +++ b/mlir/test/Target/LLVMIR/Import/instructions.ll @@ -570,6 +570,31 @@ define void @varargs_call(i32 %0) { ; // ----- +; CHECK: @varargs(...) +declare void @varargs(...) + +; CHECK-LABEL: @varargs_call +; CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]] +define void @varargs_call(i32 %0) { + ; CHECK: llvm.call @varargs(%[[ARG1]]) vararg(!llvm.func) : (i32) -> () + call void @varargs(i32 %0) + ret void +} + +; // ----- + +; CHECK: @varargs(...) +declare void @varargs(...) + +; CHECK-LABEL: @empty_varargs_call +define void @empty_varargs_call() { + ; CHECK: llvm.call @varargs() vararg(!llvm.func) : () -> () + call void @varargs() + ret void +} + +; // ----- + ; CHECK: llvm.func @f() declare void @f() From 1b1270f30bbdb2c7a310009d0512e167b09bac48 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Fri, 24 Jan 2025 19:48:40 +0000 Subject: [PATCH 044/432] [FMV][GlobalOpt] Enable static resolution of non-FMV callers. (#124314) The undetectable FMV features predres and ls64 have been removed, therefore the optimization is now re-enabled. The llvm testsuite Graviton4 bots are expected to remain green. --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 11 +---------- llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll | 2 +- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 00c20ad5f3709..9586fc97a39f7 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -95,20 +95,11 @@ STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed"); STATISTIC(NumGlobalArraysPadded, "Number of global arrays padded to alignment boundary"); -// FIXME: -// Optimizing non-FMV callers is causing a regression in the llvm test suite, -// specifically a 'predres' version is unexpectedly trapping on GravitonG4. -// My explanation is that when the caller in not a versioned function, the -// compiler exclusively relies on the command line option, or target attribute -// to deduce whether a feature is available. However, there is no guarantee -// that in reality the host supports those implied features, which arguably -// is a user error. This option allows disabling the optimization as a short -// term workaround to keep the bots green. 
static cl::opt OptimizeNonFMVCallers("optimize-non-fmv-callers", cl::desc("Statically resolve calls to versioned " "functions from non-versioned callers."), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt EnableColdCCStressTest("enable-coldcc-stress-test", diff --git a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll index fa817a8cbf417..4b6a19d3f05cf 100644 --- a/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll +++ b/llvm/test/Transforms/GlobalOpt/resolve-fmv-ifunc.ll @@ -221,7 +221,7 @@ resolver_entry: define i32 @caller4() #8 { ; CHECK-LABEL: define i32 @caller4( ; CHECK-SAME: ) local_unnamed_addr #[[ATTR7:[0-9]+]] { -; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller() +; CHECK: [[CALL:%.*]] = tail call i32 @test_non_fmv_caller._Maes() ; entry: %call = tail call i32 @test_non_fmv_caller() From d398c0c97aa0bfaeed5647f75bc37c87b8142f79 Mon Sep 17 00:00:00 2001 From: siya100 <85541510+siya100@users.noreply.github.com> Date: Sat, 25 Jan 2025 02:02:51 +0530 Subject: [PATCH 045/432] [libc][cpio] Add cpio.h header. (#123798) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [libc][docs] add cpio to documentation and include related functi… These changes ensure that the cpio header is documented properly with respect to the issue (https://github.com/llvm/llvm-project/issues/122006 ). **Changes:** 1. **cpio.yaml**: Created a new YAML file for cpio with functions and related macros. 2. **CMakeLists.txt**: Added cpio to the documentation directories. 3. **index.rst**: Included `cpio` in the documentation index. --------- Co-authored-by: siya --- libc/docs/CMakeLists.txt | 1 + libc/docs/headers/index.rst | 1 + libc/utils/docgen/cpio.yaml | 44 +++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 libc/utils/docgen/cpio.yaml diff --git a/libc/docs/CMakeLists.txt b/libc/docs/CMakeLists.txt index bb8e3e96e47ca..fc5e505c3be69 100644 --- a/libc/docs/CMakeLists.txt +++ b/libc/docs/CMakeLists.txt @@ -37,6 +37,7 @@ if (SPHINX_FOUND) aio arpa/inet assert + cpio ctype errno fenv diff --git a/libc/docs/headers/index.rst b/libc/docs/headers/index.rst index d08552d223252..bd48dd5989bcd 100644 --- a/libc/docs/headers/index.rst +++ b/libc/docs/headers/index.rst @@ -8,6 +8,7 @@ Implementation Status arpa/inet assert complex + cpio ctype errno fenv diff --git a/libc/utils/docgen/cpio.yaml b/libc/utils/docgen/cpio.yaml new file mode 100644 index 0000000000000..b31c03778fba5 --- /dev/null +++ b/libc/utils/docgen/cpio.yaml @@ -0,0 +1,44 @@ +macros: + C_IRUSR: + in-latest-posix: '' + C_IWUSR: + in-latest-posix: '' + C_IXUSR: + in-latest-posix: '' + C_IRGRP: + in-latest-posix: '' + C_IWGRP: + in-latest-posix: '' + C_IXGRP: + in-latest-posix: '' + C_IROTH: + in-latest-posix: '' + C_IWOTH: + in-latest-posix: '' + C_IXOTH: + in-latest-posix: '' + C_ISUID: + in-latest-posix: '' + C_ISGID: + in-latest-posix: '' + C_ISVTX: + in-latest-posix: '' + C_ISDIR: + in-latest-posix: '' + C_ISFIFO: + in-latest-posix: '' + C_ISREG: + in-latest-posix: '' + C_ISBLK: + in-latest-posix: '' + C_ISCHR: + in-latest-posix: '' + C_ISCTG: + in-latest-posix: '' + C_ISLNK: + in-latest-posix: '' + C_ISSOCK: + in-latest-posix: '' + MAGIC: + in-latest-posix: '' + From 074a25fb2678dacb4f3c6a24d5f907788c858e7a Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 24 Jan 2025 13:02:33 -0800 Subject: [PATCH 046/432] [RISCV][MC] Create an AsmOperand for carry-in vmask (#124317) 
Previously we used a fixed assembly string as well as a fixed encoding for
the carry-in vector mask, since it will always be there. However, this makes
both the AsmParser and the disassembler either create a garbage MCOperand
for the mask or fail to add one altogether. This wouldn't be a problem for
the majority of cases, but tools like llvm-mca, which rely on MCInst, will
fail to account for the register dependency on these mask operands.
---
 .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 4 ++
 .../RISCV/Disassembler/RISCVDisassembler.cpp | 10 +++
 llvm/lib/Target/RISCV/RISCVInstrInfoV.td | 35 ++++++----
 .../MC/Disassembler/RISCV/vmask-carry-in.txt | 69 +++++++++++++++++++
 llvm/test/MC/RISCV/rvv/vmask-carry-in.s | 69 +++++++++++++++++++
 5 files changed, 172 insertions(+), 15 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt
 create mode 100644 llvm/test/MC/RISCV/rvv/vmask-carry-in.s

diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 227a6361730da..2e86a891863fd 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -1681,6 +1681,10 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
     return Error(ErrorLoc, "operand must be v0.t");
   }
+  case Match_InvalidVMaskCarryInRegister: {
+    SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+    return Error(ErrorLoc, "operand must be v0");
+  }
   case Match_InvalidSImm5Plus1: {
     return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 4) + 1,
                                       (1 << 4),
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index a0b87f7c7ff25..1c4f322e2104e 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -297,6 +297,16 @@ static DecodeStatus DecodeVRM8RegisterClass(MCInst &Inst, uint32_t RegNo,
   return MCDisassembler::Success;
 }
+static DecodeStatus DecodeVMV0RegisterClass(MCInst &Inst, uint32_t RegNo,
+                                            uint64_t Address,
+                                            const MCDisassembler *Decoder) {
+  if (RegNo)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::createReg(RISCV::V0));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus decodeVMaskReg(MCInst &Inst, uint32_t RegNo,
                                    uint64_t Address,
                                    const MCDisassembler *Decoder) {
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
index 24a881dc6810f..671e493fb3763 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -50,6 +50,13 @@ def VMaskAsmOperand : AsmOperandClass {
   let DiagnosticType = "InvalidVMaskRegister";
 }
+def VMaskCarryInAsmOperand : AsmOperandClass {
+  let Name = "RVVMaskCarryInRegOpOperand";
+  let RenderMethod = "addRegOperands";
+  let PredicateMethod = "isV0Reg";
+  let DiagnosticType = "InvalidVMaskCarryInRegister";
+}
+
 def VMaskOp : RegisterOperand {
   let ParserMatchClass = VMaskAsmOperand;
   let PrintMethod = "printVMaskReg";
@@ -57,6 +64,11 @@ def VMaskOp : RegisterOperand {
   let DecoderMethod = "decodeVMaskReg";
 }
+def VMaskCarryInOp : RegisterOperand {
+  let ParserMatchClass = VMaskCarryInAsmOperand;
+  let EncoderMethod = "getVMaskReg";
+}
+
 def simm5 : RISCVSImmLeafOp<5> {
   let MCOperandPredicate = [{
     int64_t Imm;
@@ -442,10 +454,8 @@ class VALUVV funct6, RISCVVFormat opv, string opcodestr>
 // op vd, vs2,
vs1, v0 (without mask, use v0 as carry input)
 class VALUmVV funct6, RISCVVFormat opv, string opcodestr>
     : RVInstVV {
-  let vm = 0;
-}
+                (ins VR:$vs2, VR:$vs1, VMaskCarryInOp:$vm),
+                opcodestr, "$vd, $vs2, $vs1, $vm">;
 // op vd, vs1, vs2, vm (reverse the order of vs1 and vs2)
 class VALUrVV funct6, RISCVVFormat opv, string opcodestr,
@@ -474,10 +484,8 @@ class VALUVX funct6, RISCVVFormat opv, string opcodestr>
 // op vd, vs2, rs1, v0 (without mask, use v0 as carry input)
 class VALUmVX funct6, RISCVVFormat opv, string opcodestr>
     : RVInstVX {
-  let vm = 0;
-}
+               (ins VR:$vs2, GPR:$rs1, VMaskCarryInOp:$vm),
+               opcodestr, "$vd, $vs2, $rs1, $vm">;
 // op vd, rs1, vs2, vm (reverse the order of rs1 and vs2)
 class VALUrVX funct6, RISCVVFormat opv, string opcodestr,
@@ -506,10 +514,8 @@ class VALUVI funct6, string opcodestr, Operand optype = simm5>
 // op vd, vs2, imm, v0 (without mask, use v0 as carry input)
 class VALUmVI funct6, string opcodestr, Operand optype = simm5>
     : RVInstIVI {
-  let vm = 0;
-}
+               (ins VR:$vs2, optype:$imm, VMaskCarryInOp:$vm),
+               opcodestr, "$vd, $vs2, $imm, $vm">;
 // op vd, vs2, imm, vm
 class VALUVINoVm funct6, string opcodestr, Operand optype = simm5>
@@ -1458,10 +1464,9 @@ defm VFCLASS_V : VCLS_FV_VS2<"vfclass.v", 0b010011, 0b10000>;
 let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
 // Vector Floating-Point Merge Instruction
-let vm = 0 in
 def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VR:$vd),
-                           (ins VR:$vs2, FPR32:$rs1, VMV0:$v0),
-                           "vfmerge.vfm", "$vd, $vs2, $rs1, v0">,
+                           (ins VR:$vs2, FPR32:$rs1, VMaskCarryInOp:$vm),
+                           "vfmerge.vfm", "$vd, $vs2, $rs1, $vm">,
                  SchedBinaryMC<"WriteVFMergeV", "ReadVFMergeV", "ReadVFMergeF">;
 // Vector Floating-Point Move Instruction
diff --git a/llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt b/llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt
new file mode 100644
index 0000000000000..e9af01ac60b43
--- /dev/null
+++ b/llvm/test/MC/Disassembler/RISCV/vmask-carry-in.txt
@@ -0,0 +1,69 @@
+# RUN: llvm-mc -triple=riscv64 -disassemble -show-inst --mattr=+v %s \
+# RUN:     --M no-aliases | FileCheck %s
+
+# Check if there is a MCOperand for the carry-in mask.
+
+[0x57,0x04,0x4a,0x5c]
+# CHECK: 
Date: Fri, 24 Jan 2025 13:06:11 -0800
Subject: [PATCH 047/432] [MemProf] Disable hot hints by default (#124338)

By default we were marking some contexts as hot, and adding hot hints to
unambiguously hot allocations. However, there is not yet support for
cloning to expose hot allocation contexts, and none is planned for the
foreseeable future. While we convert hot contexts to notcold contexts
during the cloning step, their existence was greatly limiting the
context trimming performed when we add the MemProf profile to the IR.

This change simply disables the generation of hot contexts / hints by
default, as few allocations were unambiguously hot. A subsequent change
will address the issue when hot hints are optionally enabled. See
PR124219 for details.
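For intuition, a minimal sketch of the gating this change adds (the
identifiers come from the MemoryProfileInfo.cpp diff below; the numeric
example is hypothetical):

  // TotalLifetimeAccessDensity is stored scaled by 100 to hold two decimal
  // places of precision, hence the division by 100. With, say,
  // TotalLifetimeAccessDensity = 400000 and AllocCount = 4, the average
  // density is 400000 / 4 / 100 = 1000 accesses per byte per second; even
  // when that exceeds MemProfMinAveLifetimeAccessDensityHotThreshold, the
  // allocation is now classified NotCold unless MemProfUseHotHints is set.
  if (MemProfUseHotHints &&
      ((float)TotalLifetimeAccessDensity) / AllocCount / 100 >
          MemProfMinAveLifetimeAccessDensityHotThreshold)
    return AllocationType::Hot;
  return AllocationType::NotCold;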
This change resulted in significant overhead reductions for a large target: ~48% reduction in the per-module ThinLTO bitcode summary sizes ~72% reduction in the distributed ThinLTO bitcode combined summary sizes ~68% reduction in thin link time ~34% reduction in thin link peak memory --- llvm/lib/Analysis/MemoryProfileInfo.cpp | 10 ++++++++-- llvm/test/Transforms/PGOProfile/memprof.ll | 13 +++++++++++-- .../PGOProfile/memprof_loop_unroll.ll | 4 +++- .../Analysis/MemoryProfileInfoTest.cpp | 18 ++++++++++++++---- 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 2f3c87a89f9f9..52f4adbdb0429 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -42,6 +42,11 @@ cl::opt MemProfMinAveLifetimeAccessDensityHotThreshold( cl::desc("The minimum TotalLifetimeAccessDensity / AllocCount for an " "allocation to be considered hot")); +cl::opt + MemProfUseHotHints("memprof-use-hot-hints", cl::init(false), cl::Hidden, + cl::desc("Enable use of hot hints (only supported for " + "unambigously hot allocations)")); + cl::opt MemProfReportHintedSizes( "memprof-report-hinted-sizes", cl::init(false), cl::Hidden, cl::desc("Report total allocation sizes of hinted allocations")); @@ -60,8 +65,9 @@ AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity, // The access densities are multiplied by 100 to hold 2 decimal places of // precision, so need to divide by 100. - if (((float)TotalLifetimeAccessDensity) / AllocCount / 100 > - MemProfMinAveLifetimeAccessDensityHotThreshold) + if (MemProfUseHotHints && + ((float)TotalLifetimeAccessDensity) / AllocCount / 100 > + MemProfMinAveLifetimeAccessDensityHotThreshold) return AllocationType::Hot; return AllocationType::NotCold; diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll index c0e44cccbf16f..367069e993fe1 100644 --- a/llvm/test/Transforms/PGOProfile/memprof.ll +++ b/llvm/test/Transforms/PGOProfile/memprof.ll @@ -84,6 +84,8 @@ ; RUN: llvm-profdata merge -memprof-random-hotness -memprof-random-hotness-seed=1730170724 %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand2 2>&1 | FileCheck %s --check-prefix=RAND2 ; RAND2: random hotness seed = 1730170724 ; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2,ALL,MEMPROFONLY,MEMPROFSTATS +;; Check with hot hints enabled +; RUN: opt < %s -memprof-use-hot-hints -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2HOT ; MEMPROFMATCHINFO: MemProf notcold context with id 1093248920606587996 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 5725971306423925017 has total profiled size 10 is matched @@ -408,8 +410,15 @@ for.end: ; preds = %for.cond ; MEMPROFRAND2: !"cold" ; MEMPROFRAND2: !"cold" ; MEMPROFRAND2: !"cold" -; MEMPROFRAND2: !"hot" -; MEMPROFRAND2: !"hot" +; MEMPROFRAND2: !"notcold" +; MEMPROFRAND2: !"notcold" + +;; With hot hints enabled the last 2 should be hot. +; MEMPROFRAND2HOT: !"cold" +; MEMPROFRAND2HOT: !"cold" +; MEMPROFRAND2HOT: !"cold" +; MEMPROFRAND2HOT: !"hot" +; MEMPROFRAND2HOT: !"hot" ; MEMPROFSTATS: 8 memprof - Number of alloc contexts in memory profile. ; MEMPROFSTATS: 10 memprof - Number of callsites in memory profile. 
diff --git a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll index 9bc1282ab4529..2461ca32e9821 100644 --- a/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll +++ b/llvm/test/Transforms/PGOProfile/memprof_loop_unroll.ll @@ -10,7 +10,9 @@ ;; $ clang++ -gmlt -fdebug-info-for-profiling -S %S/Inputs/memprof_loop_unroll_b.cc -emit-llvm ; RUN: llvm-profdata merge %S/Inputs/memprof_loop_unroll.memprofraw --profiled-binary %S/Inputs/memprof_loop_unroll.exe -o %t.memprofdata -; RUN: opt < %s -passes='memprof-use' -S -memprof-report-hinted-sizes 2>&1 | FileCheck %s +;; Set the minimum lifetime threshold to 0 to ensure that one context is +;; considered cold (the other will be notcold). +; RUN: opt < %s -passes='memprof-use' -S -memprof-report-hinted-sizes -memprof-ave-lifetime-cold-threshold=0 2>&1 | FileCheck %s ;; Conservatively annotate as not cold. We get two messages as there are two ;; unrolled copies of the allocation. diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp index 4c177ae844690..3888faf5453d3 100644 --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -25,6 +25,7 @@ using namespace llvm::memprof; extern cl::opt MemProfLifetimeAccessDensityColdThreshold; extern cl::opt MemProfAveLifetimeColdThreshold; extern cl::opt MemProfMinAveLifetimeAccessDensityHotThreshold; +extern cl::opt MemProfUseHotHints; namespace { @@ -81,14 +82,23 @@ TEST_F(MemoryProfileInfoTest, GetAllocType) { // MemProfMinAveLifetimeAccessDensityHotThreshold // so compute the HotTotalLifetimeAccessDensityThreshold at the threshold. const uint64_t HotTotalLifetimeAccessDensityThreshold = - (uint64_t)(MemProfMinAveLifetimeAccessDensityHotThreshold * AllocCount * 100); - - + (uint64_t)(MemProfMinAveLifetimeAccessDensityHotThreshold * AllocCount * + 100); + + // Make sure the option for detecting hot allocations is set. + MemProfUseHotHints = true; // Test Hot // More accesses per byte per sec than hot threshold is hot. EXPECT_EQ(getAllocType(HotTotalLifetimeAccessDensityThreshold + 1, AllocCount, ColdTotalLifetimeThreshold + 1), - AllocationType::Hot); + AllocationType::Hot); + // Undo the manual set of the option above. + cl::ResetAllOptionOccurrences(); + + // Without MemProfUseHotHints (default) we should treat simply as NotCold. + EXPECT_EQ(getAllocType(HotTotalLifetimeAccessDensityThreshold + 1, AllocCount, + ColdTotalLifetimeThreshold + 1), + AllocationType::NotCold); // Test Cold // Long lived with less accesses per byte per sec than cold threshold is cold. 
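(Usage note for the flag introduced in the patch above: a sketch mirroring
the RUN lines added to memprof.ll, with placeholder input and profile file
names:

  opt < memprof.ll -memprof-use-hot-hints \
      -passes='memprof-use<profile-filename=memprof.memprofdata>' -S

Without -memprof-use-hot-hints, the same unambiguously hot contexts come
back annotated notcold, which is what the updated MEMPROFRAND2 checks
expect.)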
From ee054404dfde9913ed47d9bac5ea2be28926f5ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 13:09:58 -0800 Subject: [PATCH 048/432] [flang][cuda] Carry over the cuf.proc_attr attribute to gpu.launch_func (#124325) --- .../Optimizer/Transforms/CUFOpConversion.cpp | 5 +++++ flang/test/Fir/CUDA/cuda-launch.fir | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp index 7292ce741b85b..cc525d703ae57 100644 --- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp @@ -810,6 +810,7 @@ struct CUFLaunchOpConversion rewriter.getContext(), op.getCallee().getLeafReference().getValue())}); mlir::Value clusterDimX, clusterDimY, clusterDimZ; + cuf::ProcAttributeAttr procAttr; if (auto funcOp = symTab.lookup( op.getCallee().getLeafReference())) { if (auto clusterDimsAttr = funcOp->getAttrOfType( @@ -821,6 +822,8 @@ struct CUFLaunchOpConversion clusterDimZ = rewriter.create( loc, clusterDimsAttr.getZ().getInt()); } + procAttr = + funcOp->getAttrOfType(cuf::getProcAttrName()); } llvm::SmallVector args; for (mlir::Value arg : op.getArgs()) { @@ -855,6 +858,8 @@ struct CUFLaunchOpConversion gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY); gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ); } + if (procAttr) + gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr); rewriter.replaceOp(op, gpuLaunchOp); return mlir::success(); } diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir index 8432b9ec926e3..7833fc7b490bf 100644 --- a/flang/test/Fir/CUDA/cuda-launch.fir +++ b/flang/test/Fir/CUDA/cuda-launch.fir @@ -104,3 +104,24 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e // CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr, !fir.ref, i32) -> !fir.llvm_ptr // CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr) -> !fir.ref>>> // CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref>>>) + +// ----- + +module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} { + gpu.module @cuda_device_mod { + gpu.func @_QMdevptrPtest() kernel { + gpu.return + } + } + func.func @_QMdevptrPtest() attributes {cuf.proc_attr = #cuf.cuda_proc} { + return + } + func.func @_QQmain() { + %c1_i32 = arith.constant 1 : i32 + cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>() + return + } +} + +// CHECK-LABEL: func.func @_QQmain() +// CHECK: gpu.launch_func 
@cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} {cuf.proc_attr = #cuf.cuda_proc} From df9b31f1e0cdb8096e9d2e0749e473dd815b39f7 Mon Sep 17 00:00:00 2001 From: Nathan Ridge Date: Fri, 24 Jan 2025 16:11:18 -0500 Subject: [PATCH 049/432] [clang][Sema] Handle undeduced auto types in HeuristicResolver (#124236) Fixes https://github.com/clangd/clangd/issues/897 --- clang/lib/Sema/HeuristicResolver.cpp | 17 +++++++- .../unittests/Sema/HeuristicResolverTest.cpp | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp index 2a726fe51d355..e893afed71d26 100644 --- a/clang/lib/Sema/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -227,6 +227,7 @@ std::vector HeuristicResolverImpl::resolveMemberExpr( } // Try resolving the member inside the expression's base type. + Expr *Base = ME->isImplicitAccess() ? nullptr : ME->getBase(); QualType BaseType = ME->getBaseType(); if (ME->isArrow()) { BaseType = getPointeeType(BaseType); @@ -237,11 +238,25 @@ std::vector HeuristicResolverImpl::resolveMemberExpr( // If BaseType is the type of a dependent expression, it's just // represented as BuiltinType::Dependent which gives us no information. We // can get further by analyzing the dependent expression. - Expr *Base = ME->isImplicitAccess() ? nullptr : ME->getBase(); if (Base && BT->getKind() == BuiltinType::Dependent) { BaseType = resolveExprToType(Base); } } + if (const auto *AT = BaseType->getContainedAutoType()) { + // If BaseType contains a dependent `auto` type, deduction will not have + // been performed on it yet. In simple cases (e.g. `auto` variable with + // initializer), get the approximate type that would result from deduction. + // FIXME: A more accurate implementation would propagate things like the + // `const` in `const auto`. 
+ if (AT->isUndeducedAutoType()) { + if (const auto *DRE = dyn_cast(Base)) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + if (VD->hasInit()) + BaseType = resolveExprToType(VD->getInit()); + } + } + } + } return resolveDependentMember(BaseType, ME->getMember(), NoFilter); } diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp index 2cd5486b3227f..2b775b11719ea 100644 --- a/clang/unittests/Sema/HeuristicResolverTest.cpp +++ b/clang/unittests/Sema/HeuristicResolverTest.cpp @@ -155,6 +155,46 @@ TEST(HeuristicResolver, MemberExpr_SmartPointer_Qualified) { cxxMethodDecl(hasName("find"), isConst()).bind("output")); } +TEST(HeuristicResolver, MemberExpr_AutoTypeDeduction1) { + std::string Code = R"cpp( + template + struct A { + int waldo; + }; + template + void foo(A a) { + auto copy = a; + copy.waldo; + } + )cpp"; + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + +TEST(HeuristicResolver, MemberExpr_AutoTypeDeduction2) { + std::string Code = R"cpp( + struct B { + int waldo; + }; + + template + struct A { + B b; + }; + template + void foo(A a) { + auto b = a.b; + b.waldo; + } + )cpp"; + expectResolution( + Code, &HeuristicResolver::resolveMemberExpr, + cxxDependentScopeMemberExpr(hasMemberName("waldo")).bind("input"), + fieldDecl(hasName("waldo")).bind("output")); +} + TEST(HeuristicResolver, MemberExpr_Chained) { std::string Code = R"cpp( struct A { void foo() {} }; From 73b462321c2968a450779f8f6c240f46a1830376 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 24 Jan 2025 13:39:20 -0800 Subject: [PATCH 050/432] [libc] Include size_t type header in strings.h (#124352) A number of functions in strings.h take size_t as an argument. --- libc/include/strings.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libc/include/strings.yaml b/libc/include/strings.yaml index e672dca6a94dd..b6aa8f6d60b27 100644 --- a/libc/include/strings.yaml +++ b/libc/include/strings.yaml @@ -1,7 +1,8 @@ header: strings.h header_template: strings.h.def macros: [] -types: [] +types: + - type_name: size_t enums: [] objects: [] functions: From b41987beaedaa6ea78fd8dd11ba8c3b21eb8fa88 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 13:59:32 -0800 Subject: [PATCH 051/432] [SandboxVec][DAG] Fix MemDGNode chain maintenance when move destination is non-mem (#124227) This patch fixes a bug in the maintenance of the MemDGNode chain of the DAG. Whenever we move a memory instruction, the DAG gets notified about the move and maintains the chain of memory nodes. The bug was that if the destination of the move was not a memory instruction, then the memory node's next node would end up pointing to itself. --- .../SandboxVectorizer/DependencyGraph.h | 16 +++-- .../SandboxVectorizer/DependencyGraph.cpp | 70 +++++++++++++------ .../SandboxVectorizer/DependencyGraphTest.cpp | 43 ++++++++++++ 3 files changed, 103 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index b2d7c9b8aa8bb..6e3f99d78b932 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -218,12 +218,14 @@ class MemDGNode final : public DGNode { friend class PredIterator; // For MemPreds. 
/// Creates both edges: this<->N. void setNextNode(MemDGNode *N) { + assert(N != this && "About to point to self!"); NextMemN = N; if (NextMemN != nullptr) NextMemN->PrevMemN = this; } /// Creates both edges: N<->this. void setPrevNode(MemDGNode *N) { + assert(N != this && "About to point to self!"); PrevMemN = N; if (PrevMemN != nullptr) PrevMemN->NextMemN = this; @@ -348,13 +350,15 @@ class DependencyGraph { void createNewNodes(const Interval &NewInterval); /// Helper for `notify*Instr()`. \Returns the first MemDGNode that comes - /// before \p N, including or excluding \p N based on \p IncludingN, or - /// nullptr if not found. - MemDGNode *getMemDGNodeBefore(DGNode *N, bool IncludingN) const; + /// before \p N, skipping \p SkipN, including or excluding \p N based on + /// \p IncludingN, or nullptr if not found. + MemDGNode *getMemDGNodeBefore(DGNode *N, bool IncludingN, + MemDGNode *SkipN = nullptr) const; /// Helper for `notifyMoveInstr()`. \Returns the first MemDGNode that comes - /// after \p N, including or excluding \p N based on \p IncludingN, or nullptr - /// if not found. - MemDGNode *getMemDGNodeAfter(DGNode *N, bool IncludingN) const; + /// after \p N, skipping \p SkipN, including or excluding \p N based on \p + /// IncludingN, or nullptr if not found. + MemDGNode *getMemDGNodeAfter(DGNode *N, bool IncludingN, + MemDGNode *SkipN = nullptr) const; /// Called by the callbacks when a new instruction \p I has been created. void notifyCreateInstr(Instruction *I); diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index f080111f08d45..390a5e9688cc7 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -325,29 +325,31 @@ void DependencyGraph::createNewNodes(const Interval &NewInterval) { setDefUseUnscheduledSuccs(NewInterval); } -MemDGNode *DependencyGraph::getMemDGNodeBefore(DGNode *N, - bool IncludingN) const { +MemDGNode *DependencyGraph::getMemDGNodeBefore(DGNode *N, bool IncludingN, + MemDGNode *SkipN) const { auto *I = N->getInstruction(); for (auto *PrevI = IncludingN ? I : I->getPrevNode(); PrevI != nullptr; PrevI = PrevI->getPrevNode()) { auto *PrevN = getNodeOrNull(PrevI); if (PrevN == nullptr) return nullptr; - if (auto *PrevMemN = dyn_cast(PrevN)) + auto *PrevMemN = dyn_cast(PrevN); + if (PrevMemN != nullptr && PrevMemN != SkipN) return PrevMemN; } return nullptr; } -MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N, - bool IncludingN) const { +MemDGNode *DependencyGraph::getMemDGNodeAfter(DGNode *N, bool IncludingN, + MemDGNode *SkipN) const { auto *I = N->getInstruction(); for (auto *NextI = IncludingN ? I : I->getNextNode(); NextI != nullptr; NextI = NextI->getNextNode()) { auto *NextN = getNodeOrNull(NextI); if (NextN == nullptr) return nullptr; - if (auto *NextMemN = dyn_cast(NextN)) + auto *NextMemN = dyn_cast(NextN); + if (NextMemN != nullptr && NextMemN != SkipN) return NextMemN; } return nullptr; @@ -377,6 +379,20 @@ void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) { !(To == BB->end() && std::next(I->getIterator()) == BB->end()) && "Should not have been called if destination is same as origin."); + // TODO: We can only handle fully internal movements within DAGInterval or at + // the borders, i.e., right before the top or right after the bottom. 
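+  // As an illustration: with DAGInterval spanning I1..I3 in a block
+  // I1 I2 I3 I4, the destination may be inside I1..I3, right before the
+  // top (I1), or right after the bottom (I3, i.e. at I4's position); the
+  // asserts below reject anything farther away.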
+ assert(To.getNodeParent() == I->getParent() && + "TODO: We don't support movement across BBs!"); + assert( + (To == std::next(DAGInterval.bottom()->getIterator()) || + (To != BB->end() && std::next(To) == DAGInterval.top()->getIterator()) || + (To != BB->end() && DAGInterval.contains(&*To))) && + "TODO: To should be either within the DAGInterval or right " + "before/after it."); + + // Make a copy of the DAGInterval before we update it. + auto OrigDAGInterval = DAGInterval; + // Maintain the DAGInterval. DAGInterval.notifyMoveInstr(I, To); @@ -389,23 +405,37 @@ void DependencyGraph::notifyMoveInstr(Instruction *I, const BBIterator &To) { MemDGNode *MemN = dyn_cast(N); if (MemN == nullptr) return; - // First detach it from the existing chain. + + // First safely detach it from the existing chain. MemN->detachFromChain(); + // Now insert it back into the chain at the new location. - if (To != BB->end()) { - DGNode *ToN = getNodeOrNull(&*To); - if (ToN != nullptr) { - MemN->setPrevNode(getMemDGNodeBefore(ToN, /*IncludingN=*/false)); - MemN->setNextNode(getMemDGNodeAfter(ToN, /*IncludingN=*/true)); - } + // + // We won't always have a DGNode to insert before it. If `To` is BB->end() or + // if it points to an instr after DAGInterval.bottom() then we will have to + // find a node to insert *after*. + // + // BB: BB: + // I1 I1 ^ + // I2 I2 | DAGInteval [I1 to I3] + // I3 I3 V + // I4 I4 <- `To` == right after DAGInterval + // <- `To` == BB->end() + // + if (To == BB->end() || + To == std::next(OrigDAGInterval.bottom()->getIterator())) { + // If we don't have a node to insert before, find a node to insert after and + // update the chain. + DGNode *InsertAfterN = getNode(&*std::prev(To)); + MemN->setPrevNode( + getMemDGNodeBefore(InsertAfterN, /*IncludingN=*/true, /*SkipN=*/MemN)); } else { - // MemN becomes the last instruction in the BB. - auto *TermN = getNodeOrNull(BB->getTerminator()); - if (TermN != nullptr) { - MemN->setPrevNode(getMemDGNodeBefore(TermN, /*IncludingN=*/false)); - } else { - // The terminator is outside the DAG interval so do nothing. - } + // We have a node to insert before, so update the chain. + DGNode *BeforeToN = getNode(&*To); + MemN->setPrevNode( + getMemDGNodeBefore(BeforeToN, /*IncludingN=*/false, /*SkipN=*/MemN)); + MemN->setNextNode( + getMemDGNodeAfter(BeforeToN, /*IncludingN=*/true, /*SkipN=*/MemN)); } } diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index 3fa4de501f3f5..29fc05a7f256a 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -926,3 +926,46 @@ define void @foo(ptr %ptr, ptr %ptr2, i8 %v1, i8 %v2, i8 %v3, i8 %arg) { EXPECT_EQ(LdN->getPrevNode(), S1N); EXPECT_EQ(LdN->getNextNode(), S2N); } + +// Check that the mem chain is maintained correctly when the move destination is +// not a mem node. 
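+// The store being moved lands next to zext instructions, which have no
+// MemDGNode, so the chain must be re-stitched via the nearest memory
+// neighbors rather than via a node at the destination itself.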
+TEST_F(DependencyGraphTest, MoveInstrCallbackWithNonMemInstrs) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %arg) { + %ld = load i8, ptr %ptr + %zext1 = zext i8 %arg to i32 + %zext2 = zext i8 %arg to i32 + store i8 %v1, ptr %ptr + store i8 %v2, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Ld = cast(&*It++); + [[maybe_unused]] auto *Zext1 = cast(&*It++); + auto *Zext2 = cast(&*It++); + auto *S1 = cast(&*It++); + auto *S2 = cast(&*It++); + auto *Ret = cast(&*It++); + + sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx); + DAG.extend({Ld, S2}); + auto *LdN = cast(DAG.getNode(Ld)); + auto *S1N = cast(DAG.getNode(S1)); + auto *S2N = cast(DAG.getNode(S2)); + EXPECT_EQ(LdN->getNextNode(), S1N); + EXPECT_EQ(S1N->getNextNode(), S2N); + + S1->moveBefore(Zext2); + EXPECT_EQ(LdN->getNextNode(), S1N); + EXPECT_EQ(S1N->getNextNode(), S2N); + + // Try move right after the end of the DAGInterval. + S1->moveBefore(Ret); + EXPECT_EQ(S2N->getNextNode(), S1N); + EXPECT_EQ(S1N->getNextNode(), nullptr); +} From 425d25f5df4c6814e5551640b810bec53322f3df Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi <56735936+hjyamauchi@users.noreply.github.com> Date: Fri, 24 Jan 2025 14:01:41 -0800 Subject: [PATCH 052/432] [AArch64][WinCFI] Fix a crash due to missing seh directives (#123993) https://github.com/llvm/llvm-project/issues/123808 --- .../Target/AArch64/AArch64FrameLowering.cpp | 22 +++-- .../CodeGen/AArch64/stack-hazard-windows.ll | 4 + .../AArch64/wincfi-missing-seh-directives.ll | 86 +++++++++++++++++++ 3 files changed, 103 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index eabe64361938b..a082a1ebe95bf 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1491,13 +1491,6 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( NewOpc = AArch64::LDRQpost; break; } - // Get rid of the SEH code associated with the old instruction. - if (NeedsWinCFI) { - auto SEH = std::next(MBBI); - if (AArch64InstrInfo::isSEHInstruction(*SEH)) - SEH->eraseFromParent(); - } - TypeSize Scale = TypeSize::getFixed(1), Width = TypeSize::getFixed(0); int64_t MinOffset, MaxOffset; bool Success = static_cast(TII)->getMemOpInfo( @@ -1512,16 +1505,27 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( CSStackSizeInc > MaxOffset * (int64_t)Scale.getFixedValue()) { // If we are destroying the frame, make sure we add the increment after the // last frame operation. - if (FrameFlag == MachineInstr::FrameDestroy) + if (FrameFlag == MachineInstr::FrameDestroy) { ++MBBI; + // Also skip the SEH instruction, if needed + if (NeedsWinCFI && AArch64InstrInfo::isSEHInstruction(*MBBI)) + ++MBBI; + } emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag, - false, false, nullptr, EmitCFI, + false, NeedsWinCFI, HasWinCFI, EmitCFI, StackOffset::getFixed(CFAOffset)); return std::prev(MBBI); } + // Get rid of the SEH code associated with the old instruction. 
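+  // Note: this block used to run before the early-return above, which
+  // could delete a directive even when we bail out to emitFrameOffset;
+  // erasing only on this path keeps the unwind info for the split
+  // adjustment intact (emitFrameOffset now emits its own directives).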
+ if (NeedsWinCFI) { + auto SEH = std::next(MBBI); + if (AArch64InstrInfo::isSEHInstruction(*SEH)) + SEH->eraseFromParent(); + } + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(NewOpc)); MIB.addReg(AArch64::SP, RegState::Define); diff --git a/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll b/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll index 2a034fe5e5290..927d8b68c46be 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard-windows.ll @@ -76,7 +76,9 @@ define i32 @fpr_csr_stackobj(double %x) "aarch64_pstate_sm_compatible" "frame-po ; CHECK1024: .seh_proc fpr_csr_stackobj ; CHECK1024-NEXT: // %bb.0: // %entry ; CHECK1024-NEXT: sub sp, sp, #1072 +; CHECK1024-NEXT: .seh_stackalloc 1072 ; CHECK1024-NEXT: str x23, [sp] // 8-byte Folded Spill +; CHECK1024-NEXT: .seh_save_reg x23, 0 ; CHECK1024-NEXT: str x29, [sp, #8] // 8-byte Folded Spill ; CHECK1024-NEXT: .seh_save_reg x29, 8 ; CHECK1024-NEXT: str x30, [sp, #16] // 8-byte Folded Spill @@ -105,7 +107,9 @@ define i32 @fpr_csr_stackobj(double %x) "aarch64_pstate_sm_compatible" "frame-po ; CHECK1024-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload ; CHECK1024-NEXT: .seh_save_reg x29, 8 ; CHECK1024-NEXT: ldr x23, [sp] // 8-byte Folded Reload +; CHECK1024-NEXT: .seh_save_reg x23, 0 ; CHECK1024-NEXT: add sp, sp, #1072 +; CHECK1024-NEXT: .seh_stackalloc 1072 ; CHECK1024-NEXT: .seh_endepilogue ; CHECK1024-NEXT: ret ; CHECK1024-NEXT: .seh_endfunclet diff --git a/llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll b/llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll new file mode 100644 index 0000000000000..2002c37cb2528 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/wincfi-missing-seh-directives.ll @@ -0,0 +1,86 @@ +; RUN: llc -mtriple=aarch64-windows %s --filetype obj -o /dev/null +; RUN: llc -mtriple=aarch64-windows %s --filetype asm -o - | FileCheck %s + +; Check that it doesn't crash and that each instruction in the +; prologue has a corresponding seh directive. 
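+; (Illustrative note: the obj run drives the Windows unwind-info emitter,
+; which is where the 'Incorrect size for ...' diagnostic and the crash
+; surfaced when a prologue instruction lacked its directive.)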
+; +; CHECK-NOT: error: Incorrect size for +; CHECK: foo: +; CHECK: .seh_proc foo +; CHECK: sub sp, sp, #288 +; CHECK: .seh_stackalloc 288 +; CHECK: str x19, [sp] // 8-byte Folded Spill +; CHECK: .seh_save_reg x19, 0 +; CHECK: str x21, [sp, #8] // 8-byte Folded Spill +; CHECK: .seh_save_reg x21, 8 +; CHECK: stp x23, x24, [sp, #16] // 16-byte Folded Spill +; CHECK: .seh_save_regp x23, 16 +; CHECK: stp x25, x26, [sp, #32] // 16-byte Folded Spill +; CHECK: .seh_save_regp x25, 32 +; CHECK: stp x27, x28, [sp, #48] // 16-byte Folded Spill +; CHECK: .seh_save_regp x27, 48 +; CHECK: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK: .seh_save_fplr 64 +; CHECK: sub sp, sp, #224 +; CHECK: .seh_stackalloc 224 +; CHECK: .seh_endprologue + +target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32" +target triple = "aarch64-unknown-windows-msvc19.42.34436" + +%swift.refcounted = type { ptr, i64 } +%TScA_pSg = type <{ [16 x i8] }> +%T5repro4TestVSg = type <{ [32 x i8] }> +%T5repro4TestV = type <{ %TSS, %TSS }> +%TSS = type <{ %Ts11_StringGutsV }> +%Ts11_StringGutsV = type <{ %Ts13_StringObjectV }> +%Ts13_StringObjectV = type <{ %Ts6UInt64V, ptr }> +%Ts6UInt64V = type <{ i64 }> + +declare swiftcc ptr @swift_task_alloc() + +declare swifttailcc void @bar(ptr, ptr, i64, i64, i64, ptr, i64, i64, i64, i64, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr, i64, ptr) + +define swifttailcc void @foo(ptr %0, ptr swiftasync %1, ptr swiftself %2, ptr %3, ptr %._guts2._object._object, ptr %.rid4._guts._object._object, ptr %4, ptr %.idx8, ptr %.idx8._guts._object._object, ptr %5, ptr %.rid9._guts._object._object, ptr %6) { +entry: + %7 = load i64, ptr null, align 8 + %8 = load i64, ptr %3, align 8 + %9 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 2 + %10 = load i64, ptr %9, align 8 + %11 = load ptr, ptr %1, align 8 + %12 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 3 + %13 = load i64, ptr %.rid9._guts._object._object, align 8 + %14 = load i64, ptr %.idx8._guts._object._object, align 8 + %15 = load i64, ptr %5, align 8 + %16 = getelementptr { i64, i64, i64, i64 }, ptr %12, i32 0, i32 3 + %17 = load i64, ptr %16, align 8 + %18 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 4 + %19 = load i64, ptr %18, align 8 + %.rid._guts._object._object = getelementptr %Ts13_StringObjectV, ptr %18, i32 0, i32 1 + %20 = load ptr, ptr %.rid._guts._object._object, align 8 + %21 = load i64, ptr %.rid4._guts._object._object, align 8 + %22 = load i64, ptr %0, align 8 + %23 = load ptr, ptr %6, align 8 + %24 = load i64, ptr %2, align 8 + %25 = load ptr, ptr %._guts2._object._object, align 8 + %26 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 7 + %27 = load i64, ptr %26, align 8 + %._guts3._object._object = getelementptr %Ts13_StringObjectV, ptr %26, i32 0, i32 1 + %28 = load ptr, ptr %._guts3._object._object, align 8 + %29 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, 
%T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 8 + %30 = load i64, ptr %29, align 8 + %.idx5 = getelementptr %T5repro4TestV, ptr %29, i32 0, i32 1 + %31 = load i64, ptr %.idx5, align 8 + %.idx5._guts._object._object = getelementptr %Ts13_StringObjectV, ptr %.idx5, i32 0, i32 1 + %32 = load ptr, ptr %.idx5._guts._object._object, align 8 + %33 = getelementptr <{ %swift.refcounted, %TScA_pSg, %TSS, %T5repro4TestVSg, %T5repro4TestV, %TSS, %TSS, %TSS, %T5repro4TestV, %TSS, %T5repro4TestV, %T5repro4TestV, %TSS }>, ptr %2, i32 0, i32 9 + %34 = load i64, ptr %33, align 8 + %35 = load i64, ptr %4, align 8 + %36 = load i64, ptr %.idx8, align 8 + %37 = load i64, ptr %1, align 8 + %38 = call swiftcc ptr @swift_task_alloc() + store ptr null, ptr %3, align 8 + store ptr null, ptr %4, align 8 + musttail call swifttailcc void @bar(ptr null, ptr swiftasync %.rid4._guts._object._object, i64 %7, i64 %8, i64 %10, ptr %5, i64 %13, i64 %14, i64 %15, i64 %17, i64 %19, ptr %20, i64 %21, ptr %.idx8, i64 %22, ptr %23, i64 %24, ptr %25, i64 %27, ptr %28, i64 %30, ptr %.idx8._guts._object._object, i64 %31, ptr %32, i64 %34, ptr %._guts2._object._object, i64 %35, ptr %2, i64 %36, ptr %1, i64 %37, ptr %0, i64 0, ptr null, i64 0, ptr null) + ret void +} From 77c23fd0aa1534abe904c2d5256a6d7879dc3cf7 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 24 Jan 2025 14:12:18 -0800 Subject: [PATCH 053/432] [AMDGPU] Update AMDGPUUsage.rst to document two intrinsics (#123816) The AMDGPUUsage.rst file is updated to document two intrinsics: llvm.amdgcn.mov.dpp and llvm.amdgcn.update.dpp. --- llvm/docs/AMDGPUUsage.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 40b393224f15d..8f09df2406f10 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1422,6 +1422,19 @@ The AMDGPU backend implements the following LLVM IR intrinsics. Returns a pair for the swapped registers. The first element of the return corresponds to the swapped element of the first argument. + llvm.amdgcn.mov.dpp The llvm.amdgcn.mov.dpp.`` intrinsic represents the mov.dpp operation in AMDGPU. + This operation is being deprecated and can be replaced with llvm.amdgcn.update.dpp. + + llvm.amdgcn.update.dpp The llvm.amdgcn.update.dpp.`` intrinsic represents the update.dpp operation in AMDGPU. + It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control. + Various data types are supported, including, bf16, f16, f32, f64, i16, i32, i64, p0, p3, p5, v2f16, v2f32, v2i16, v2i32, v2p0, v3i32, v4i32, v8f16. + This operation is equivalent to a sequence of v_mov_b32 operations. + It is preferred over llvm.amdgcn.mov.dpp.`` for future use. + `llvm.amdgcn.update.dpp. ` + Should be equivalent to: + - `v_mov_b32 ` + - `v_mov_b32 ` + ============================================== ========================================================== .. TODO:: From 34c6c5e72f48de65a7e332033af9566576c1895d Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 24 Jan 2025 14:20:24 -0800 Subject: [PATCH 054/432] [BOLT][AArch64] Fix PLT optimization (#124192) Preserve C++ exception metadata while running PLT optimization on AArch64. 
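The essence of the fix, sketched from the AArch64 hunk below (not a complete implementation): the helper now takes the direct call by rvalue and transfers every MCPlus annotation to the new indirect call, where previously only the tail-call bit was recreated and the EH landing-pad annotation was lost:

  // before (lossy): only the tail-call annotation survived the rewrite
  if (IsTailCall)
    setTailCall(InstCall);
  // after: handler and tail-call annotations both move to the `blr x17`
  moveAnnotations(std::move(DirectCall), InstCall);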
--- bolt/include/bolt/Core/MCPlusBuilder.h | 7 ++++--- bolt/lib/Passes/PLTCall.cpp | 4 ++-- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 5 ++--- bolt/lib/Target/X86/X86MCPlusBuilder.cpp | 2 +- bolt/test/AArch64/exceptions-plt.cpp | 21 +++++++++++++++++++ bolt/test/runtime/exceptions-plt.cpp | 16 ++++++++++++++ 6 files changed, 46 insertions(+), 9 deletions(-) create mode 100644 bolt/test/AArch64/exceptions-plt.cpp create mode 100644 bolt/test/runtime/exceptions-plt.cpp diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 5d77e6faff2fc..c1460b2aac8a6 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1426,11 +1426,12 @@ class MCPlusBuilder { } /// Creates an indirect call to the function within the \p DirectCall PLT - /// stub. The function's memory location is pointed by the \p TargetLocation + /// stub. The function's address location is pointed by the \p TargetLocation /// symbol. + /// Move instruction annotations from \p DirectCall to the indirect call. virtual InstructionListType - createIndirectPltCall(const MCInst &DirectCall, - const MCSymbol *TargetLocation, MCContext *Ctx) { + createIndirectPLTCall(MCInst &&DirectCall, const MCSymbol *TargetLocation, + MCContext *Ctx) { llvm_unreachable("not implemented"); return {}; } diff --git a/bolt/lib/Passes/PLTCall.cpp b/bolt/lib/Passes/PLTCall.cpp index 2ed996fadbb99..31c2d92ebc204 100644 --- a/bolt/lib/Passes/PLTCall.cpp +++ b/bolt/lib/Passes/PLTCall.cpp @@ -70,8 +70,8 @@ Error PLTCall::runOnFunctions(BinaryContext &BC) { const BinaryFunction *CalleeBF = BC.getFunctionForSymbol(CallSymbol); if (!CalleeBF || !CalleeBF->isPLTFunction()) continue; - const InstructionListType NewCode = BC.MIB->createIndirectPltCall( - *II, CalleeBF->getPLTSymbol(), BC.Ctx.get()); + const InstructionListType NewCode = BC.MIB->createIndirectPLTCall( + std::move(*II), CalleeBF->getPLTSymbol(), BC.Ctx.get()); II = BB.replaceInstruction(II, NewCode); assert(!NewCode.empty() && "PLT Call replacement must be non-empty"); std::advance(II, NewCode.size() - 1); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 0b6f21527f0ac..ac709c5dd063a 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -1263,7 +1263,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return true; } - InstructionListType createIndirectPltCall(const MCInst &DirectCall, + InstructionListType createIndirectPLTCall(MCInst &&DirectCall, const MCSymbol *TargetLocation, MCContext *Ctx) override { const bool IsTailCall = isTailCall(DirectCall); @@ -1297,8 +1297,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { MCInst InstCall; InstCall.setOpcode(IsTailCall ? 
AArch64::BR : AArch64::BLR); InstCall.addOperand(MCOperand::createReg(AArch64::X17)); - if (IsTailCall) - setTailCall(InstCall); + moveAnnotations(std::move(DirectCall), InstCall); Code.emplace_back(InstCall); return Code; diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 63086c06d74fd..465533ee71f2b 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -1605,7 +1605,7 @@ class X86MCPlusBuilder : public MCPlusBuilder { return true; } - InstructionListType createIndirectPltCall(const MCInst &DirectCall, + InstructionListType createIndirectPLTCall(MCInst &&DirectCall, const MCSymbol *TargetLocation, MCContext *Ctx) override { assert((DirectCall.getOpcode() == X86::CALL64pcrel32 || diff --git a/bolt/test/AArch64/exceptions-plt.cpp b/bolt/test/AArch64/exceptions-plt.cpp new file mode 100644 index 0000000000000..576f0fc91a9d8 --- /dev/null +++ b/bolt/test/AArch64/exceptions-plt.cpp @@ -0,0 +1,21 @@ +// Verify that PLT optimization in BOLT preserves exception-handling info. + +// REQUIRES: system-linux + +// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s -o %t.exe +// RUN: llvm-bolt %t.exe -o %t.bolt.exe --plt=all --print-only=.*main.* \ +// RUN: --print-finalized 2>&1 | FileCheck %s + +// CHECK-LABEL: Binary Function +// CHECK: adrp {{.*}}__cxa_throw +// CHECK-NEXT: ldr {{.*}}__cxa_throw +// CHECK-NEXT: blr x17 {{.*}} handler: {{.*}} PLTCall: + +int main() { + try { + throw new int; + } catch (...) { + return 0; + } + return 1; +} diff --git a/bolt/test/runtime/exceptions-plt.cpp b/bolt/test/runtime/exceptions-plt.cpp new file mode 100644 index 0000000000000..8a75a3cb384b9 --- /dev/null +++ b/bolt/test/runtime/exceptions-plt.cpp @@ -0,0 +1,16 @@ +// Verify that PLT optimization in BOLT preserves exception-handling info. + +// REQUIRES: system-linux + +// RUN: %clangxx %cxxflags -O1 -Wl,-q,-znow %s -o %t.exe +// RUN: llvm-bolt %t.exe -o %t.bolt.exe --plt=all +// RUN: %t.bolt.exe + +int main() { + try { + throw new int; + } catch (...) { + return 0; + } + return 1; +} From 4b209c5d87c8b8eb4bbf2750ea9daa5927a13699 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 14:28:55 -0800 Subject: [PATCH 055/432] [SandboxIR][Region] Add cost modeling to the region (#124354) This patch implements cost modeling for Region. All instructions that are added or removed get their cost counted in the Scoreboard. This is used for checking if the region before or after a transformation is more profitable. 
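A sketch of the intended use (hypothetical caller, not code from this patch):

  const auto &SB = Rgn.getScoreboard();
  // BeforeCost accumulates original instructions the rewrite deleted;
  // AfterCost accumulates instructions the rewrite added to the region.
  bool Profitable = SB.getAfterCost() < SB.getBeforeCost();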
--- llvm/include/llvm/SandboxIR/Region.h | 52 +++++++++++-- llvm/include/llvm/SandboxIR/Value.h | 1 + llvm/lib/SandboxIR/Region.cpp | 35 ++++++++- .../Passes/RegionsFromMetadata.cpp | 2 +- llvm/unittests/SandboxIR/PassTest.cpp | 8 +- llvm/unittests/SandboxIR/RegionTest.cpp | 73 ++++++++++++++++--- 6 files changed, 151 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h index 8133e01734ea7..c1195141cb54c 100644 --- a/llvm/include/llvm/SandboxIR/Region.h +++ b/llvm/include/llvm/SandboxIR/Region.h @@ -6,18 +6,55 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H -#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H +#ifndef LLVM_SANDBOXIR_REGION_H +#define LLVM_SANDBOXIR_REGION_H #include #include "llvm/ADT/SetVector.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/SandboxIR/Instruction.h" #include "llvm/Support/raw_ostream.h" namespace llvm::sandboxir { +class Region; + +class ScoreBoard { + const Region &Rgn; + TargetTransformInfo &TTI; + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// The cost of all instructions added to the region. + InstructionCost AfterCost = 0; + /// The cost of all instructions that got removed and replaced by new ones. + InstructionCost BeforeCost = 0; + /// Helper for both add() and remove(). \Returns the TTI cost of \p I. + InstructionCost getCost(Instruction *I) const; + /// No need to allow copies. + ScoreBoard(const ScoreBoard &) = delete; + const ScoreBoard &operator=(const ScoreBoard &) = delete; + +public: + ScoreBoard(Region &Rgn, TargetTransformInfo &TTI) : Rgn(Rgn), TTI(TTI) {} + /// Mark \p I as a newly added instruction to the region. + void add(Instruction *I) { AfterCost += getCost(I); } + /// Mark \p I as a deleted instruction from the region. + void remove(Instruction *I); + /// \Returns the cost of the newly added instructions. + InstructionCost getAfterCost() const { return AfterCost; } + /// \Returns the cost of the Removed instructions. + InstructionCost getBeforeCost() const { return BeforeCost; } + +#ifndef NDEBUG + void dump(raw_ostream &OS) const { + OS << "BeforeCost: " << BeforeCost << "\n"; + OS << "AfterCost: " << AfterCost << "\n"; + } + LLVM_DUMP_METHOD void dump() const; +#endif // NDEBUG +}; + /// The main job of the Region is to point to new instructions generated by /// vectorization passes. It is the unit that RegionPasses operate on with their /// runOnRegion() function. @@ -62,6 +99,8 @@ class Region { static constexpr const char *RegionStr = "sandboxregion"; Context &Ctx; + /// Keeps track of cost of instructions added and removed. + ScoreBoard Scoreboard; /// ID (for later deregistration) of the "create instruction" callback. Context::CallbackID CreateInstCB; @@ -72,7 +111,7 @@ class Region { // TODO: Add a way to encode/decode region info to/from metadata. public: - Region(Context &Ctx); + Region(Context &Ctx, TargetTransformInfo &TTI); ~Region(); Context &getContext() const { return Ctx; } @@ -91,7 +130,10 @@ class Region { iterator end() { return Insts.end(); } iterator_range insts() { return make_range(begin(), end()); } - static SmallVector> createRegionsFromMD(Function &F); + static SmallVector> + createRegionsFromMD(Function &F, TargetTransformInfo &TTI); + /// \Returns the ScoreBoard data structure that keeps track of instr costs. 
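+  /// Callers can compare getBeforeCost() against getAfterCost() to judge
+  /// whether the region's rewrite paid off.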
+ const ScoreBoard &getScoreboard() const { return Scoreboard; } #ifndef NDEBUG /// This is an expensive check, meant for testing. @@ -109,4 +151,4 @@ class Region { } // namespace llvm::sandboxir -#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_REGION_H +#endif // LLVM_SANDBOXIR_REGION_H diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index 243195f4c1c4b..28e33ca0f2312 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -167,6 +167,7 @@ class Value { // Region needs to manipulate metadata in the underlying LLVM Value, we don't // expose metadata in sandboxir. friend class Region; + friend class ScoreBoard; // Needs access to `Val` for the instruction cost. /// All values point to the context. Context &Ctx; diff --git a/llvm/lib/SandboxIR/Region.cpp b/llvm/lib/SandboxIR/Region.cpp index 1455012440f90..8c84d0c46fa10 100644 --- a/llvm/lib/SandboxIR/Region.cpp +++ b/llvm/lib/SandboxIR/Region.cpp @@ -11,7 +11,29 @@ namespace llvm::sandboxir { -Region::Region(Context &Ctx) : Ctx(Ctx) { +InstructionCost ScoreBoard::getCost(Instruction *I) const { + auto *LLVMI = cast(I->Val); + SmallVector Operands(LLVMI->operands()); + return TTI.getInstructionCost(LLVMI, Operands, CostKind); +} + +void ScoreBoard::remove(Instruction *I) { + auto Cost = getCost(I); + if (Rgn.contains(I)) + // If `I` is one of the newly added ones, then we should adjust `AfterCost` + AfterCost -= Cost; + else + // If `I` is one of the original instructions (outside the region) then it + // is part of the original code, so adjust `BeforeCost`. + BeforeCost += Cost; +} + +#ifndef NDEBUG +void ScoreBoard::dump() const { dump(dbgs()); } +#endif + +Region::Region(Context &Ctx, TargetTransformInfo &TTI) + : Ctx(Ctx), Scoreboard(*this, TTI) { LLVMContext &LLVMCtx = Ctx.LLVMCtx; auto *RegionStrMD = MDString::get(LLVMCtx, RegionStr); RegionMDN = MDNode::getDistinct(LLVMCtx, {RegionStrMD}); @@ -31,9 +53,15 @@ void Region::add(Instruction *I) { Insts.insert(I); // TODO: Consider tagging instructions lazily. cast(I->Val)->setMetadata(MDKind, RegionMDN); + // Keep track of the instruction cost. + Scoreboard.add(I); } void Region::remove(Instruction *I) { + // Keep track of the instruction cost. This needs to be done *before* we remove // `I` from the region.
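+  // (ScoreBoard::remove() consults Rgn.contains(I) to decide between
+  // adjusting AfterCost and BeforeCost, so `I` must still be a member.)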
+ Scoreboard.remove(I); + Insts.remove(I); cast(I->Val)->setMetadata(MDKind, nullptr); } @@ -58,7 +86,8 @@ void Region::dump() const { } #endif // NDEBUG -SmallVector> Region::createRegionsFromMD(Function &F) { +SmallVector> +Region::createRegionsFromMD(Function &F, TargetTransformInfo &TTI) { SmallVector> Regions; DenseMap MDNToRegion; auto &Ctx = F.getContext(); @@ -68,7 +97,7 @@ SmallVector> Region::createRegionsFromMD(Function &F) { Region *R = nullptr; auto It = MDNToRegion.find(MDN); if (It == MDNToRegion.end()) { - Regions.push_back(std::make_unique(Ctx)); + Regions.push_back(std::make_unique(Ctx, TTI)); R = Regions.back().get(); MDNToRegion[MDN] = R; } else { diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp index 8e3f5b77429c5..121a195f45ee4 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp @@ -19,7 +19,7 @@ RegionsFromMetadata::RegionsFromMetadata(StringRef Pipeline) bool RegionsFromMetadata::runOnFunction(Function &F, const Analyses &A) { SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(F); + sandboxir::Region::createRegionsFromMD(F, A.getTTI()); for (auto &R : Regions) { RPM.runOnRegion(*R, A); } diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp index 751aedefd8fe2..19fce94563e48 100644 --- a/llvm/unittests/SandboxIR/PassTest.cpp +++ b/llvm/unittests/SandboxIR/PassTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Pass.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Module.h" #include "llvm/SandboxIR/Constant.h" @@ -23,10 +24,13 @@ struct PassTest : public testing::Test { llvm::LLVMContext LLVMCtx; std::unique_ptr LLVMM; std::unique_ptr Ctx; + std::unique_ptr TTI; Function *parseFunction(const char *IR, const char *FuncName) { llvm::SMDiagnostic Err; LLVMM = parseAssemblyString(IR, Err, LLVMCtx); + TTI = std::make_unique(LLVMM->getDataLayout()); + if (!LLVMM) Err.print("PassTest", llvm::errs()); Ctx = std::make_unique(LLVMCtx); @@ -119,7 +123,7 @@ define i8 @foo(i8 %v0, i8 %v1) { EXPECT_EQ(TPass.getName(), "test-pass"); // Check runOnRegion(); llvm::SmallVector> Regions = - Region::createRegionsFromMD(*F); + Region::createRegionsFromMD(*F, *TTI); ASSERT_EQ(Regions.size(), 1u); TPass.runOnRegion(*Regions[0], Analyses::emptyForTesting()); EXPECT_EQ(InstCount, 2u); @@ -242,7 +246,7 @@ define i8 @foo(i8 %v0, i8 %v1) { RPM.addPass(std::make_unique(InstCount2)); // Check runOnRegion(). 
llvm::SmallVector> Regions = - Region::createRegionsFromMD(*F); + Region::createRegionsFromMD(*F, *TTI); ASSERT_EQ(Regions.size(), 1u); RPM.runOnRegion(*Regions[0], Analyses::emptyForTesting()); EXPECT_EQ(InstCount1, 2u); diff --git a/llvm/unittests/SandboxIR/RegionTest.cpp b/llvm/unittests/SandboxIR/RegionTest.cpp index 47368f93a32c0..1ee72d127daa4 100644 --- a/llvm/unittests/SandboxIR/RegionTest.cpp +++ b/llvm/unittests/SandboxIR/RegionTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/SandboxIR/Region.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/SandboxIR/Context.h" #include "llvm/SandboxIR/Function.h" @@ -20,10 +21,12 @@ using namespace llvm; struct RegionTest : public testing::Test { LLVMContext C; std::unique_ptr M; + std::unique_ptr TTI; void parseIR(LLVMContext &C, const char *IR) { SMDiagnostic Err; M = parseAssemblyString(IR, Err, C); + TTI = std::make_unique(M->getDataLayout()); if (!M) Err.print("RegionTest", errs()); } @@ -45,7 +48,7 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T0 = cast(&*It++); auto *T1 = cast(&*It++); auto *Ret = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); // Check getContext. EXPECT_EQ(&Ctx, &Rgn.getContext()); @@ -73,7 +76,7 @@ define i8 @foo(i8 %v0, i8 %v1) { #ifndef NDEBUG // Check equality comparison. Insert in reverse order into `Other` to check // that comparison is order-independent. - sandboxir::Region Other(Ctx); + sandboxir::Region Other(Ctx, *TTI); Other.add(Ret); EXPECT_NE(Rgn, Other); Other.add(T1); @@ -98,7 +101,7 @@ define i8 @foo(i8 %v0, i8 %v1, ptr %ptr) { auto *T0 = cast(&*It++); auto *T1 = cast(&*It++); auto *Ret = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); Rgn.add(T0); Rgn.add(T1); @@ -134,7 +137,7 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T2 = cast(&*It++); SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(*F); + sandboxir::Region::createRegionsFromMD(*F, *TTI); EXPECT_THAT(Regions[0]->insts(), testing::UnorderedElementsAre(T0)); EXPECT_THAT(Regions[1]->insts(), testing::UnorderedElementsAre(T1, T2)); } @@ -160,7 +163,7 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T2 = cast(&*It++); SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(*F); + sandboxir::Region::createRegionsFromMD(*F, *TTI); EXPECT_THAT(Regions[0]->insts(), testing::UnorderedElementsAre(T0, T2)); } @@ -182,9 +185,9 @@ define i8 @foo(i8 %v0, i8 %v1) { [[maybe_unused]] auto *T1 = cast(&*It++); auto *T2 = cast(&*It++); [[maybe_unused]] auto *Ret = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); Rgn.add(T0); - sandboxir::Region Rgn2(Ctx); + sandboxir::Region Rgn2(Ctx, *TTI); Rgn2.add(T2); std::string output; @@ -226,14 +229,66 @@ define i8 @foo(i8 %v0, i8 %v1) { auto *T0 = cast(&*It++); auto *T1 = cast(&*It++); - sandboxir::Region Rgn(Ctx); + sandboxir::Region Rgn(Ctx, *TTI); Rgn.add(T0); Rgn.add(T1); SmallVector> Regions = - sandboxir::Region::createRegionsFromMD(*F); + sandboxir::Region::createRegionsFromMD(*F, *TTI); ASSERT_EQ(1U, Regions.size()); #ifndef NDEBUG EXPECT_EQ(Rgn, *Regions[0].get()); #endif } + +TEST_F(RegionTest, RegionCost) { + parseIR(C, R"IR( +define void @foo(i8 %v0, i8 %v1, i8 %v2) { + %add0 = add i8 %v0, 1 + %add1 = add i8 %v1, 2 + %add2 = add i8 %v2, 3 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + auto *LLVMBB = &*LLVMF->begin(); + auto LLVMIt = LLVMBB->begin(); + auto 
*LLVMAdd0 = &*LLVMIt++; + auto *LLVMAdd1 = &*LLVMIt++; + auto *LLVMAdd2 = &*LLVMIt++; + + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Add0 = cast(&*It++); + auto *Add1 = cast(&*It++); + auto *Add2 = cast(&*It++); + + sandboxir::Region Rgn(Ctx, *TTI); + const auto &SB = Rgn.getScoreboard(); + EXPECT_EQ(SB.getAfterCost(), 0); + EXPECT_EQ(SB.getBeforeCost(), 0); + + auto GetCost = [this](llvm::Instruction *LLVMI) { + constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + SmallVector Operands(LLVMI->operands()); + return TTI->getInstructionCost(LLVMI, Operands, CostKind); + }; + // Add `Add0` to the region, should be counted in "After". + Rgn.add(Add0); + EXPECT_EQ(SB.getBeforeCost(), 0); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd0)); + // Same for `Add1`. + Rgn.add(Add1); + EXPECT_EQ(SB.getBeforeCost(), 0); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd0) + GetCost(LLVMAdd1)); + // Remove `Add0`, should be subtracted from "After". + Rgn.remove(Add0); + EXPECT_EQ(SB.getBeforeCost(), 0); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd1)); + // Remove `Add2` which was never in the region, should be counted in "Before". + Rgn.remove(Add2); + EXPECT_EQ(SB.getBeforeCost(), GetCost(LLVMAdd2)); + EXPECT_EQ(SB.getAfterCost(), GetCost(LLVMAdd1)); +} From 05fd4d5775e2c40c00057d7af195290bc3a39cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 14:32:07 -0800 Subject: [PATCH 056/432] [flang][cuda] Perform inlined assignment when field is c_devptr (#124322) When a field in a derived type is `c_devptr`, keep checking whether we can do a memcpy instead of falling back to the runtime assignment. Many internal CUDA Fortran derived types have a `c_devptr` field and this would lead to stack overflow on the device if the assignment is performed by the runtime function. --- flang/lib/Optimizer/Builder/FIRBuilder.cpp | 3 ++- flang/test/Lower/CUDA/cuda-devptr.cuf | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index 64c540cfb95ae..35dc9a2abd69c 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -1410,7 +1410,8 @@ static bool recordTypeCanBeMemCopied(fir::RecordType recordType) { for (auto [_, fieldType] : recordType.getTypeList()) { // Derived type component may have user assignment (so far, we cannot tell // in FIR, so assume it is always the case, TODO: get the actual info). - if (mlir::isa(fir::unwrapSequenceType(fieldType))) + if (mlir::isa(fir::unwrapSequenceType(fieldType)) && + !fir::isa_builtin_c_devptr_type(fir::unwrapSequenceType(fieldType))) return false; // Allocatable components need deep copy.
if (auto boxType = mlir::dyn_cast(fieldType)) diff --git a/flang/test/Lower/CUDA/cuda-devptr.cuf b/flang/test/Lower/CUDA/cuda-devptr.cuf index d61d84d9bc750..0a9087cf6c133 100644 --- a/flang/test/Lower/CUDA/cuda-devptr.cuf +++ b/flang/test/Lower/CUDA/cuda-devptr.cuf @@ -4,6 +4,12 @@ module cudafct use __fortran_builtins, only : c_devptr => __builtin_c_devptr + + type :: t1 + type(c_devptr) :: devp + integer :: a + end type + contains function c_devloc(x) use iso_c_binding, only: c_loc @@ -12,6 +18,10 @@ contains real, target, device :: x c_devloc%cptr = c_loc(x) end function + + attributes(device) function get_t1() + type(t1) :: get_t1 + end end subroutine sub1() @@ -68,3 +78,12 @@ end subroutine ! CHECK: %[[P_ADDR_COORD:.*]] = fir.coordinate_of %[[P_CPTR_COORD]], %[[ADDRESS_FIELD]] : (!fir.ref>, !fir.field) -> !fir.ref ! CHECK: %[[ADDR:.*]] = fir.load %[[RES_ADDR_COORD]] : !fir.ref ! CHECK: fir.store %[[ADDR]] to %[[P_ADDR_COORD]] : !fir.ref + +attributes(global) subroutine assign_nested_c_devptr(p, a) + use cudafct + type(t1), device :: p + p = get_t1() +end subroutine + +! CHECK-LABEL: func.func @_QPassign_nested_c_devptr +! CHECK-NOT: fir.call @_FortranAAssign From cff7ad56babc2e8e7c731b3f60d3c0b4c8aca96f Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 14:35:20 -0800 Subject: [PATCH 057/432] [SandboxVec][Utils] Implement Utils::verifyFunction() (#124356) This patch implements a wrapper function for the LLVM IR verifier for functions, and calls it (flag-guarded) within the bottom-up-vectorizer for finding IR bugs as soon as they happen. --- llvm/include/llvm/SandboxIR/Utils.h | 9 +++++++++ .../SandboxVectorizer/Passes/BottomUpVec.cpp | 18 ++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/Utils.h b/llvm/include/llvm/SandboxIR/Utils.h index d58fe52214395..5c6f0d9edd618 100644 --- a/llvm/include/llvm/SandboxIR/Utils.h +++ b/llvm/include/llvm/SandboxIR/Utils.h @@ -17,6 +17,8 @@ #include "llvm/Analysis/MemoryLocation.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Verifier.h" +#include "llvm/SandboxIR/Function.h" #include "llvm/SandboxIR/Instruction.h" #include @@ -122,6 +124,13 @@ class Utils { const std::optional &OptLoc) { return BatchAA.getModRefInfo(cast(I->Val), OptLoc); } + + /// Equivalent to llvm::verifyFunction(). + /// \Returns true if the IR is broken. 
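+  /// Note: this verifies the whole underlying llvm::Function, so it is
+  /// expensive and meant for debugging (e.g. the -sbvec-always-verify
+  /// flag below).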
+ static bool verifyFunction(const Function *F, raw_ostream &OS) { + const auto &LLVMF = *cast(F->Val); + return llvm::verifyFunction(LLVMF, &OS); + } }; } // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 7cebde335cb4e..b3a477c64a5cc 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -27,6 +27,13 @@ static cl::opt AllowNonPow2("sbvec-allow-non-pow2", cl::init(false), cl::Hidden, cl::desc("Allow non-power-of-2 vectorization.")); +#ifndef NDEBUG +static cl::opt + AlwaysVerify("sbvec-always-verify", cl::init(false), cl::Hidden, + cl::desc("Helps find bugs by verifying the IR whenever we " + "emit new instructions (*very* expensive).")); +#endif // NDEBUG + namespace sandboxir { BottomUpVec::BottomUpVec(StringRef Pipeline) @@ -365,6 +372,17 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, break; } } +#ifndef NDEBUG + if (AlwaysVerify) { + // This helps find broken IR by constantly verifying the function. Note that + // this is very expensive and should only be used for debugging. + Instruction *I0 = isa(Bndl[0]) + ? cast(Bndl[0]) + : cast(UserBndl[0]); + assert(!Utils::verifyFunction(I0->getParent()->getParent(), dbgs()) && + "Broken function!"); + } +#endif return NewVec; } From d910fbcbd10c5e72d0771dd9607e7133ae51dc70 Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 24 Jan 2025 14:46:01 -0800 Subject: [PATCH 058/432] [RISCV][NFC] cR Constraint Release Note --- llvm/docs/ReleaseNotes.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 05d902641d093..0872f20bea590 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -272,6 +272,9 @@ Changes to the RISC-V Backend * `cf` constraint meaning an RVC-encoding compatible FPR (`f8`-`f15`) * `R` constraint meaning an even-odd GPR pair (prints as the even register, but both registers in the pair are considered live). + * `cR` constraint meaning an RVC-encoding compatible even-odd GPR Pair (prints + as an even register between `x8` and `x14`, but both registers in the pair + are considered live). * `N` modifer meaning print the register encoding (0-31) rather than the name. * `f` and `cf` inline assembly constraints, when using F-/D-/H-in-X extensions, will use the relevant GPR rather than FPR. This makes inline assembly portable From ac75d322801411f496fe5d1155c86453f915ae98 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 14:52:57 -0800 Subject: [PATCH 059/432] [SandboxVec][VecUtils] Filter out instructions not in BB in VecUtils:getLowest() (#124360) This patch changes the functionality of `VecUtils::getLowest(Vals, BB)` such that it filters out any instructions in `Vals` that are not in BB. This is useful when Vals contains instructions from different BBs, because in that case we are only interested in one BB. 
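Sketch of the new behavior (hypothetical values, mirroring the updated unit tests): given I0 in BB0 and I1, I2 in BB1 in that program order,

  // Previously nullptr because the values span two blocks; now I0 is
  // skipped and the lowest of the BB1 instructions is returned.
  Instruction *Lowest = VecUtils::getLowest({I0, I1, I2}, BB1); // == I2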
--- .../Vectorize/SandboxVectorizer/VecUtils.h | 17 ++++++------ .../SandboxVectorizer/Passes/BottomUpVec.cpp | 2 +- .../Transforms/SandboxVectorizer/cross_bbs.ll | 4 +-- .../test/Transforms/SandboxVectorizer/pack.ll | 4 +-- .../SandboxVectorizer/VecUtilsTest.cpp | 27 ++++++++++++------- 5 files changed, 32 insertions(+), 22 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h index 64090febc5a09..bec1cecf241f6 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h @@ -111,10 +111,12 @@ class VecUtils { return LowestI; } /// \Returns the lowest instruction in \p Vals, or nullptr if no instructions - /// are found or if not in the same BB. - static Instruction *getLowest(ArrayRef Vals) { - // Find the first Instruction in Vals. - auto It = find_if(Vals, [](Value *V) { return isa(V); }); + /// are found. Skips instructions not in \p BB. + static Instruction *getLowest(ArrayRef Vals, BasicBlock *BB) { + // Find the first Instruction in Vals that is also in `BB`. + auto It = find_if(Vals, [BB](Value *V) { + return isa(V) && cast(V)->getParent() == BB; + }); // If we couldn't find an instruction return nullptr. if (It == Vals.end()) return nullptr; @@ -122,15 +124,14 @@ class VecUtils { // Now look for the lowest instruction in Vals starting from one position // after FirstI. Instruction *LowestI = FirstI; - auto *LowestBB = LowestI->getParent(); for (auto *V : make_range(std::next(It), Vals.end())) { auto *I = dyn_cast(V); // Skip non-instructions. if (I == nullptr) continue; - // If the instructions are in different BBs return nullptr. - if (I->getParent() != LowestBB) - return nullptr; + // Skips instructions not in \p BB. + if (I->getParent() != BB) + continue; // If `LowestI` comes before `I` then `I` is the new lowest. if (LowestI->comesBefore(I)) LowestI = I; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index b3a477c64a5cc..6f65657d29790 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -54,7 +54,7 @@ static SmallVector getOperand(ArrayRef Bndl, /// of BB if no instruction found in \p Vals. static BasicBlock::iterator getInsertPointAfterInstrs(ArrayRef Vals, BasicBlock *BB) { - auto *BotI = VecUtils::getLastPHIOrSelf(VecUtils::getLowest(Vals)); + auto *BotI = VecUtils::getLastPHIOrSelf(VecUtils::getLowest(Vals, BB)); if (BotI == nullptr) // We are using BB->begin() (or after PHIs) as the fallback insert point. 
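// (That happens when Vals holds only constants, arguments, or
// instructions from other blocks, all of which getLowest() skips.)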
return BB->empty() diff --git a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll index e913fc5913ba7..6ec31060d7e0f 100644 --- a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll +++ b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll @@ -8,10 +8,10 @@ define void @cross_bbs(ptr %ptr) { ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr i8, ptr [[PTR]], i32 1 ; CHECK-NEXT: [[L0:%.*]] = load i8, ptr [[PTR0]], align 1 ; CHECK-NEXT: [[L1:%.*]] = load i8, ptr [[PTR1]], align 1 -; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[L0]], i32 0 -; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[L1]], i32 1 ; CHECK-NEXT: br label %[[BB:.*]] ; CHECK: [[BB]]: +; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[L0]], i32 0 +; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[L1]], i32 1 ; CHECK-NEXT: store <2 x i8> [[PACK1]], ptr [[PTR0]], align 1 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SandboxVectorizer/pack.ll b/llvm/test/Transforms/SandboxVectorizer/pack.ll index 373ab743fb890..a0aa2a79a0ade 100644 --- a/llvm/test/Transforms/SandboxVectorizer/pack.ll +++ b/llvm/test/Transforms/SandboxVectorizer/pack.ll @@ -59,12 +59,12 @@ define void @packFromOtherBB(ptr %ptr, i8 %val) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[ADD0:%.*]] = add i8 [[VAL]], 0 ; CHECK-NEXT: [[MUL1:%.*]] = mul i8 [[VAL]], 1 -; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[ADD0]], i32 0 -; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[MUL1]], i32 1 ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[PHI0:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ 1, %[[LOOP]] ] ; CHECK-NEXT: [[PHI1:%.*]] = phi i8 [ 0, %[[ENTRY]] ], [ 1, %[[LOOP]] ] +; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[ADD0]], i32 0 +; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[MUL1]], i32 1 ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 ; CHECK-NEXT: store <2 x i8> [[PACK1]], ptr [[GEP0]], align 1 ; CHECK-NEXT: br label %[[LOOP]] diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp index a46e47afea3c7..5c062df8112f6 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/VecUtilsTest.cpp @@ -461,24 +461,33 @@ define void @foo(i8 %v) { // Check getLowest(ArrayRef) SmallVector C1Only({C1}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(C1Only), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1Only, &BB), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1Only, &BB0), nullptr); SmallVector AOnly({IA}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AOnly), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AOnly, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AOnly, &BB0), nullptr); SmallVector AC1({IA, C1}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1, &BB0), nullptr); SmallVector C1A({C1, IA}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(C1A), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1A, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1A, &BB0), nullptr); SmallVector AC1B({IA, C1, IB}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1B), IB); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1B, &BB), IB); + 
EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1B, &BB0), nullptr); SmallVector ABC1({IA, IB, C1}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC1), IB); + EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC1, &BB), IB); + EXPECT_EQ(sandboxir::VecUtils::getLowest(ABC1, &BB0), nullptr); SmallVector AC1C2({IA, C1, C2}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1C2), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1C2, &BB), IA); + EXPECT_EQ(sandboxir::VecUtils::getLowest(AC1C2, &BB0), nullptr); SmallVector C1C2C3({C1, C2, C3}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(C1C2C3), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1C2C3, &BB), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(C1C2C3, &BB0), nullptr); SmallVector DiffBBs({BB0I, IA}); - EXPECT_EQ(sandboxir::VecUtils::getLowest(DiffBBs), nullptr); + EXPECT_EQ(sandboxir::VecUtils::getLowest(DiffBBs, &BB0), BB0I); + EXPECT_EQ(sandboxir::VecUtils::getLowest(DiffBBs, &BB), IA); } TEST_F(VecUtilsTest, GetLastPHIOrSelf) { From 4df9c17e5f436702ca4f5439322972b0385d629a Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Fri, 24 Jan 2025 23:59:00 +0100 Subject: [PATCH 060/432] [libc++] Fix tests for clang::no_specializations for C++17 and C++20 --- libcxx/include/__type_traits/result_of.h | 2 +- .../ranges/no_specializations.verify.cpp | 4 +++- .../type_traits/no_specializations.verify.cpp | 24 ++++++++++++------- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index 217ca70b4cd20..8cc009dbe8baa 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template -struct _LIBCPP_DEPRECATED_IN_CXX17 result_of; +struct _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS result_of; template struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {}; diff --git a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp index 69d458a920558..489e3a6a73744 100644 --- a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp @@ -13,7 +13,9 @@ #include -#if !__has_warning("-Winvalid-specialization") +#include "test_macros.h" + +#if !__has_warning("-Winvalid-specialization") || TEST_STD_VER <= 20 // expected-no-diagnostics #else struct S {}; diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index e6d960667e8c0..807d01e381b49 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -36,15 +36,22 @@ SPECIALIZE_TRAIT(make_unsigned); // expected-error {{cannot be specialize SPECIALIZE_TRAIT(remove_all_extents); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_const); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_cv); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_extent); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_pointer); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_reference); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_volatile); // expected-error {{cannot 
be specialized}}
-SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}}
 SPECIALIZE_TRAIT(underlying_type); // expected-error {{cannot be specialized}}
-SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}}
-SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}}
+
+# if TEST_STD_VER <= 17
+SPECIALIZE_TRAIT(result_of); // expected-error {{cannot be specialized}}
+# endif
+
+# if TEST_STD_VER >= 20
+SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}}
+SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}}
+SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}}
+SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}}
+# endif

 # undef SPECIALIZE_TRAIT
 # define SPECIALIZE_UTT(Trait) \
@@ -96,7 +103,6 @@ SPECIALIZE_UTT(is_move_assignable); // expected-error 2 {{cannot
 SPECIALIZE_UTT(is_move_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_BTT(is_nothrow_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_default_constructible); // expected-error 2 {{cannot be specialized}}
@@ -130,7 +136,6 @@ SPECIALIZE_UTT(is_trivially_default_constructible); // expected-error 2 {{cannot
 SPECIALIZE_UTT(is_trivially_destructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_union); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_unsigned); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_void); // expected-error 2 {{cannot be specialized}}
@@ -140,11 +145,12 @@ SPECIALIZE_UTT(rank); // expected-error 2 {{cannot

 # if TEST_STD_VER <= 17
 SPECIALIZE_UTT(is_literal_type); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(result_of); // expected-error 2 {{cannot be specialized}}
 # endif

 # if TEST_STD_VER >= 20
-SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 # endif

 # if TEST_STD_VER >= 23
@@ -171,6 +177,8 @@ struct std::conditional; // expected-error {{cannot be specialized}}
 template <>
 struct std::enable_if; // expected-error {{cannot be specialized}}

+#if TEST_STD_VER >= 20
 template <>
 struct std::integral_constant; // expected-error {{cannot be specialized}}
 #endif
+#endif

From e2005d1461942539f7533a518aa78017074f6bf9 Mon Sep 17 00:00:00 2001
From: Jacob Lalonde
Date: Fri, 24 Jan 2025 14:59:56 -0800
Subject: [PATCH 061/432] [LLDB] Reapply #123873 SBSaveCore Docstrings
 (#124355)

In my last attempt at this (#123873), I didn't realize we needed
semicolons! Also fixed the bug where the feature summary didn't have a
type defined.
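For reference, a rough sketch of how these documented APIs are meant to
be driven from the Python bindings (illustrative only: it assumes an
existing lldb.SBProcess named `process` and the SBSaveCoreOptions
overload of SBProcess.SaveCore; the plugin name and output path are
placeholders):

    import lldb

    options = lldb.SBSaveCoreOptions()
    options.SetPluginName("minidump")           # a plugin registered with the plugin manager
    options.SetStyle(lldb.eSaveCoreCustomOnly)  # save only what the options specify
    options.SetOutputFile(lldb.SBFileSpec("/tmp/example.dmp"))
    options.SetProcess(process)
    error = options.AddThread(process.GetSelectedThread())  # returns an SBError
    if error.Success():
        error = process.SaveCore(options)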
CC @JDevlieghere hope you get a laugh at needing to revert docstrings for breaking the build....
---
 .../interface/SBSaveCoreOptionsDocstrings.i   | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i b/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i
index e69de29bb2d1d..08bbdf89d68de 100644
--- a/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i
+++ b/lldb/bindings/interface/SBSaveCoreOptionsDocstrings.i
@@ -0,0 +1,71 @@
+%feature("docstring",
+"A container to specify how to save a core file.
+
+SBSaveCoreOptions includes APIs to specify the memory regions and threads to include
+when generating a core file. It extends the existing SaveCoreStyle option.
+
+* eSaveCoreFull will save off all thread and memory regions, ignoring the memory regions and threads in
+the options object.
+
+* eSaveCoreDirtyOnly will capture all threads and all rw- memory regions, in addition to the regions specified
+in the options object if they are not already captured.
+
+* eSaveCoreStackOnly will capture all threads, but no memory regions unless specified.
+
+* eSaveCoreCustomOnly defers entirely to the SBSaveCoreOptions object and will only save what is specified.
+  Picking custom and specifying nothing will result in an error being returned.
+
+Note that currently ELF Core files are not supported."
+) lldb::SBSaveCoreOptions;
+
+%feature("docstring", "
+    Set the plugin name to save a Core file with. Only plugins registered with the plugin manager will be accepted.
+    Examples are Minidump and Mach-O."
+) lldb::SBSaveCoreOptions::SetPluginName;
+
+%feature("docstring", "
+    Get the specified plugin name, or None if the name is not set."
+) lldb::SBSaveCoreOptions::GetPluginName;
+
+%feature("docstring", "
+    Set the lldb.SaveCoreStyle."
+) lldb::SBSaveCoreOptions::SetStyle;
+
+%feature("docstring", "
+    Get the specified lldb.SaveCoreStyle, or eSaveCoreUnspecified if not set."
+) lldb::SBSaveCoreOptions::GetStyle;
+
+%feature("docstring", "
+    Set the file path to save the Core file at."
+) lldb::SBSaveCoreOptions::SetOutputFile;
+
+%feature("docstring", "
+    Get an SBFileSpec corresponding to the specified output path, or None if not set."
+) lldb::SBSaveCoreOptions::GetOutputFile;
+
+%feature("docstring", "
+    Set the process to save, or unset a process by providing a default SBProcess.
+    Resetting will result in the reset of all process-specific options, such as the threads to save."
+) lldb::SBSaveCoreOptions::SetProcess;
+
+%feature("docstring", "
+    Add an SBThread to be saved; an error will be returned if an SBThread from a different process is specified.
+    The process is set either by the first SBThread added to the options container, or explicitly by the SetProcess call."
+) lldb::SBSaveCoreOptions::AddThread;
+
+%feature("docstring", "
+    Remove an SBThread if present in the container; returns true if a matching thread was found and removed."
+) lldb::SBSaveCoreOptions::RemoveThread;
+
+%feature("docstring", "
+    Add a memory region to save; an error will be returned if the region is invalid.
+    Ranges that overlap will be unioned into a single region."
+) lldb::SBSaveCoreOptions::AddMemoryRegionToSave;
+
+%feature("docstring", "
+    Get an SBThreadCollection of all threads marked to be saved. This collection is not sorted according to insertion order."
+) lldb::SBSaveCoreOptions::GetThreadsToSave;
+
+%feature("docstring", "
+    Unset all options."
+) lldb::SBSaveCoreOptions::Clear;

From 241e5d8c5c424155e02e05524e8f731fc524aa40 Mon Sep 17 00:00:00 2001
From: Brox Chen
Date: Fri, 24 Jan 2025 18:15:40 -0500
Subject: [PATCH 062/432] [AMDGPU][True16][MC] true16 for v_cmpx_eq_f16
 (#124038)

True16 format for v_cmpx_eq_f16. Also cleaned up some stray gfx11 check
lines in the gfx12 dasm tests.
---
 llvm/lib/Target/AMDGPU/VOPCInstructions.td    |  2 +-
 .../AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s  | 65 ++++----
 .../AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s   | 25 +++-
 .../MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s     | 14 +-
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s         | 75 ++++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s   | 65 ++++----
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s    | 21 ++-
 llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s | 42 ++++--
 .../MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s   | 42 ++++--
 llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s        | 14 +-
 llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s  | 73 +++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s   | 33 +++--
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s         | 72 +++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s   | 62 ++++----
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s    | 18 ++-
 llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s | 42 ++++--
 .../MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s   | 42 ++++--
 .../gfx11_dasm_vop3_dpp16_from_vopcx.txt      | 54 +++++--
 .../gfx11_dasm_vop3_dpp8_from_vopcx.txt       | 24 ++-
 .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt     | 14 +-
 .../Disassembler/AMDGPU/gfx11_dasm_vopcx.txt  | 65 ++++++--
 .../AMDGPU/gfx11_dasm_vopcx_dpp16.txt         | 54 +++++--
 .../AMDGPU/gfx11_dasm_vopcx_dpp8.txt          | 24 ++-
 .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 28 ++--
 .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt        | 140 +++++++++++-------
 .../AMDGPU/gfx12_dasm_vop3cx_dpp8.txt         | 64 +++-----
 .../Disassembler/AMDGPU/gfx12_dasm_vopcx.txt  | 61 ++++++--
 .../AMDGPU/gfx12_dasm_vopcx_dpp16.txt         | 50 +++++--
 .../AMDGPU/gfx12_dasm_vopcx_dpp8.txt          | 14 +-
 29 files changed, 845 insertions(+), 454 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index aa930249c5003..46cad585b8a82 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1977,7 +1977,7 @@ defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>;
 defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">;
 defm V_CMPX_LT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">;
-defm V_CMPX_EQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">;
+defm V_CMPX_EQ_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">;
 defm V_CMPX_LE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">;
 defm V_CMPX_GT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">;
 defm V_CMPX_LG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">;
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s
index 80264a4a791bb..e946097f35e23 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s
@@ -96,47 +96,56 @@ v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30]

-v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11:
v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s index 119e4826b3277..e60406078f745 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s @@ -29,17 +29,26 @@ v_cmpx_class_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfe,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] -v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 1614f00e1f07e..799a8f86a01e9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -149,11 +149,11 @@ v_cmpx_class_f64_e64 -|src_scc|, src_scc v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 // GFX11: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: 
[0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_eq_f16_e64 v1, v2 -// GFX11: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_eq_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_f16_e64 v255, v255 -// GFX11: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_f16_e64 s1, s2 // GFX11: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] @@ -194,6 +194,12 @@ v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_eq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_eq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_f32_e64 v1, v2 // GFX11: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index cdad89321d89a..88d9fb6cc1357 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -143,50 +143,65 @@ v_cmpx_class_f64 src_scc, v2 v_cmpx_class_f64 0xaf123456, v255 // GFX11: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_eq_f16 v1, v2 -// GFX11: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] +v_cmpx_eq_f16 v1.l, v2.l +// GFX11: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d] -v_cmpx_eq_f16 v127, v2 -// GFX11: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] +v_cmpx_eq_f16 v127.l, v2.l +// GFX11: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d] -v_cmpx_eq_f16 s1, v2 -// GFX11: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] +v_cmpx_eq_f16 s1, v2.l +// GFX11: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d] -v_cmpx_eq_f16 s105, v2 -// GFX11: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] +v_cmpx_eq_f16 s105, v2.l +// GFX11: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d] -v_cmpx_eq_f16 vcc_lo, v2 -// GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] +v_cmpx_eq_f16 vcc_lo, v2.l +// GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d] -v_cmpx_eq_f16 vcc_hi, v2 -// GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] +v_cmpx_eq_f16 vcc_hi, v2.l +// GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d] -v_cmpx_eq_f16 ttmp15, v2 -// GFX11: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] +v_cmpx_eq_f16 ttmp15, v2.l +// GFX11: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d] -v_cmpx_eq_f16 m0, v2 -// GFX11: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] +v_cmpx_eq_f16 m0, v2.l +// GFX11: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d] -v_cmpx_eq_f16 exec_lo, v2 -// GFX11: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] +v_cmpx_eq_f16 exec_lo, v2.l +// GFX11: v_cmpx_eq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x04,0x7d] -v_cmpx_eq_f16 exec_hi, v2 -// GFX11: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: 
[0x7f,0x04,0x04,0x7d] +v_cmpx_eq_f16 exec_hi, v2.l +// GFX11: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d] -v_cmpx_eq_f16 null, v2 -// GFX11: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] +v_cmpx_eq_f16 null, v2.l +// GFX11: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d] -v_cmpx_eq_f16 -1, v2 -// GFX11: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] +v_cmpx_eq_f16 -1, v2.l +// GFX11: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d] -v_cmpx_eq_f16 0.5, v2 -// GFX11: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] +v_cmpx_eq_f16 0.5, v2.l +// GFX11: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d] -v_cmpx_eq_f16 src_scc, v2 -// GFX11: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] +v_cmpx_eq_f16 src_scc, v2.l +// GFX11: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d] -v_cmpx_eq_f16 0xfe0b, v127 -// GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_f16 0xfe0b, v127.l +// GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_eq_f16 v1.h, v2.l +// GFX11: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d] + +v_cmpx_eq_f16 v127.h, v2.l +// GFX11: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d] + +v_cmpx_eq_f16 0.5, v127.l +// GFX11: v_cmpx_eq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x04,0x7d] + +v_cmpx_eq_f16 src_scc, v2.h +// GFX11: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d] + +v_cmpx_eq_f16 0xfe0b, v127.h +// GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_eq_f32 v1, v2 // GFX11: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s index ddaa30af953b8..e8d458874596e 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s @@ -95,47 +95,56 @@ v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x35,0x30] -v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_eq_f16 v1, v2 row_mirror -// GFX11: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_mirror +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_half_mirror -// GFX11: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shl:1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shl:15 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shr:1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_shr:15 -// GFX11: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_ror:1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_ror:15 -// GFX11: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_eq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_eq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01] + +v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s index 1cead89c0a82e..4f8895faf10a2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s @@ -29,14 +29,23 @@ v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX11: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05] + +v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x05,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00] v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s index 5cab502e99647..fe2220d7f5902 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s @@ -37,23 +37,41 @@ 
v_cmpx_class_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_class_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_f16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_eq_f16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_eq_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_eq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_eq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_eq_i16_e32 v1.h, v255.h // GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s index 5102a32075066..c6814de818e6d 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s @@ -37,23 +37,41 @@ v_cmpx_class_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_class_f16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX11: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_eq_f16 v1, v255 -// GFX11: v_cmpx_eq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_eq_f16 v1.h, v255.h +// GFX11: v_cmpx_eq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v1, v255 
quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16 v255, v2 -// GFX11: v_cmpx_eq_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_eq_f16 v1.l, v255.l +// GFX11: v_cmpx_eq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_eq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_eq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_eq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_eq_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_eq_f16 v255.h, v2.h +// GFX11: v_cmpx_eq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_eq_f16 v255.l, v2.l +// GFX11: v_cmpx_eq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_eq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_eq_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_eq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_eq_i16 v1.h, v255.h // GFX11: v_cmpx_eq_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index d7bec00b83080..17bd81fa7d259 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -146,11 +146,11 @@ v_cmpx_class_f64_e64 -|src_scc|, src_scc v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 // GFX12: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_eq_f16_e64 v1, v2 -// GFX12: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_eq_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_eq_f16_e64 v255, v255 -// GFX12: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_eq_f16_e64 v255.l, v255.l +// GFX12: 
v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00] v_cmpx_eq_f16_e64 s1, s2 // GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00] @@ -191,6 +191,12 @@ v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_eq_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_eq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_eq_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_eq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_eq_f32_e64 v1, v2 // GFX12: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index faad68f902d5f..86f4b9a6789dd 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -107,53 +107,62 @@ v_cmpx_class_f32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x05,0x30] -v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 
-v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: 
v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index 588ad2b75a410..071a00ac73b8a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -41,23 +41,32 @@ v_cmpx_class_f32_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x01,0xfe,0xd4,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] -v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x82,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x82,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x82,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s
index 4d43b98978eb5..ab01d37c39d3e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s
@@ -140,50 +140,62 @@ v_cmpx_class_f64 src_scc, v2

 v_cmpx_class_f64 0xaf123456, v255
 // GFX12: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf]

-v_cmpx_eq_f16 v1, v2
-// GFX12: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]
+v_cmpx_eq_f16 v1.l, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d]

-v_cmpx_eq_f16 v127, v2
-// GFX12: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]
+v_cmpx_eq_f16 v127.l, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d]

-v_cmpx_eq_f16 s1, v2
-// GFX12: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]
+v_cmpx_eq_f16 s1, v2.l
+// GFX12: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d]

-v_cmpx_eq_f16 s105, v2
-// GFX12: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]
+v_cmpx_eq_f16 s105, v2.l
+// GFX12: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d]

-v_cmpx_eq_f16 vcc_lo, v2
-// GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]
+v_cmpx_eq_f16 vcc_lo, v2.l
+// GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d]

-v_cmpx_eq_f16 vcc_hi, v2
-// GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]
+v_cmpx_eq_f16 vcc_hi, v2.l
+// GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d]

-v_cmpx_eq_f16 ttmp15, v2
-// GFX12: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]
+v_cmpx_eq_f16 ttmp15, v2.l
+// GFX12: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d]

-v_cmpx_eq_f16 m0, v2
-// GFX12: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]
+v_cmpx_eq_f16 m0, v2.l
+// GFX12: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d]

-v_cmpx_eq_f16 exec_lo, v2
-// GFX12: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]
+v_cmpx_eq_f16 exec_lo, v2.l
+// GFX12: v_cmpx_eq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x04,0x7d]

-v_cmpx_eq_f16 exec_hi, v2
-// GFX12: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]
+v_cmpx_eq_f16 exec_hi, v2.l
+// GFX12: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d]

-v_cmpx_eq_f16 null, v2
-// GFX12: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]
+v_cmpx_eq_f16 null, v2.l
+// GFX12: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d]

-v_cmpx_eq_f16 -1, v2
-// GFX12: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]
+v_cmpx_eq_f16 -1, v2.l
+// GFX12: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d]

-v_cmpx_eq_f16 0.5, v2
-// GFX12: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]
+v_cmpx_eq_f16 0.5, v2.l
+// GFX12: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d]

-v_cmpx_eq_f16 src_scc, v2
-// GFX12: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]
+v_cmpx_eq_f16 src_scc, v2.l
+// GFX12: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d]

-v_cmpx_eq_f16 0xfe0b, v127
-// GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_eq_f16 0xfe0b, v127.l
+// GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+
+v_cmpx_eq_f16 v1.h, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d]
+
+v_cmpx_eq_f16 v127.h, v2.l
+// GFX12: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d]
+
+v_cmpx_eq_f16 src_scc, v2.h
+// GFX12: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d]
+
+v_cmpx_eq_f16 0xfe0b, v127.h
+// GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00]

 v_cmpx_eq_f32 v1, v2
 // GFX12: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s
index 5c54d1ad5788c..2b919fa9d671e 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s
@@ -92,47 +92,53 @@ v_cmpx_class_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0

 v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX12: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x35,0x30]

-v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_eq_f16 v1, v2 row_mirror
-// GFX12: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_mirror
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_half_mirror
-// GFX12: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_half_mirror
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shl:1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shl:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shl:15
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shl:15
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shr:1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shr:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_shr:15
-// GFX12: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_shr:15
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_ror:1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_ror:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_ror:15
-// GFX12: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_ror:15
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX12: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX12: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX12: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX12: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX12: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX12: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX12: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s
index c6e7fd1aa96da..11579786d78a8 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s
@@ -26,14 +26,20 @@ v_cmpx_class_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1

 v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX12: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX12: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX12: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX12: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX12: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]

 v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX12: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s
index 7ba3aff6c80ca..265ab2c8ff66d 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s
@@ -37,23 +37,41 @@ v_cmpx_class_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]

 v_cmpx_class_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
 // GFX12: :[[@LINE-1]]:22: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v1, v255
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_eq_f16_e32 v1.h, v255.h
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v255, v2
-// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_eq_f16_e32 v1.l, v255.l
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_eq_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_eq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.h, v2.h
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.l, v2.l
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_eq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction

 v_cmpx_eq_i16_e32 v1.h, v255.h
 // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s
index b7423dcde03d4..ed228c061d019 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s
@@ -37,23 +37,41 @@ v_cmpx_class_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
 v_cmpx_class_f16 v255.l, v2.l quad_perm:[3,2,1,0]
 // GFX12: v_cmpx_class_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xfd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_eq_f16 v1, v255
-// GFX12: v_cmpx_eq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_eq_f16 v1.h, v255.h
+// GFX12: v_cmpx_eq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x82,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_eq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_eq_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_eq_f16 v255, v2
-// GFX12: v_cmpx_eq_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_eq_f16 v1.l, v255.l
+// GFX12: v_cmpx_eq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_eq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_eq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_eq_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX12: v_cmpx_eq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_eq_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_eq_f16 v255.h, v2.h
+// GFX12: v_cmpx_eq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x82,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_eq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_eq_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_eq_f16 v255.l, v2.l
+// GFX12: v_cmpx_eq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_eq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_eq_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX12: v_cmpx_eq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

 v_cmpx_eq_i16 v1.h, v255.h
 // GFX12: v_cmpx_eq_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb2,0xd4,0x01,0xff,0x03,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
index 20250c1df729e..b73c7f83c7442 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt
@@ -115,46 +115,72 @@
 # GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

 0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

 0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

 0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
index a1ef8f36e77be..0b7e14108848c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt
@@ -31,16 +31,32 @@
 # GFX11: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00]

 0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

 0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

 0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

 0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
-# GFX11: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05
 # GFX11: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
index a3e9f92454e3a..cd897944845a0 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt
@@ -160,10 +160,12 @@
 # GFX11: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

 0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00
-# GFX11: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00
-# GFX11: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]

 0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00
 # GFX11: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00]

@@ -204,6 +206,14 @@
 0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
 # GFX11: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+
 0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00
 # GFX11: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
index f058a9b981625..90a95138144f1 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt
@@ -166,49 +166,84 @@
 # GFX11: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf]

 0x01,0x05,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d]

 0x7f,0x05,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d]

 0x01,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d]

 0x69,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d]

 0x6a,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d]

 0x6b,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d]

 0x7b,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d]

 0x7d,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d]

 0x7e,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d]

 0x7f,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d]

 0x7c,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d]

 0xc1,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d]

 0xf0,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d]

 0xfd,0x04,0x04,0x7d
-# GFX11: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d]

 0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00
-# GFX11: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00]
+
+0x81,0x05,0x04,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x04,0x7d]
+
+0xff,0x05,0x04,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x04,0x7d]
+
+0xf0,0xfe,0x04,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x04,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x04,0x7d]
+
+0xfd,0x04,0x05,0x7d
+# GFX11-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x05,0x7d]
+
+0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00
+# GFX11-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00]

 0x01,0x05,0x24,0x7d
 # GFX11: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt
index 5a57f93c65939..9b9b423a7b104 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt
@@ -115,46 +115,72 @@
 # GFX11: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30]

 0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff]

 0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01]

 0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13]

 0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30
-# GFX11: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30]
+# GFX11-REAL16: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30]
+
+0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_eq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_eq_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x5f,0x01,0x01]
+
+0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13]
+
+0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30
+# GFX11-REAL16: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30]
+# GFX11-FAKE16: v_cmpx_eq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30]

 0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff
 # GFX11: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
index 8350088ca95a5..6ca58524688e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt
@@ -31,10 +31,30 @@
 # GFX11: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00]

 0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05]

 0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x04,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_eq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00]

 0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05
 # GFX11: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
index 80235451fec6f..d3a19ae00fa21 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
@@ -156,10 +156,12 @@
 # GFX12: v_cmpx_class_f64_e64 0xaf123456, 0xaf123456 ; encoding: [0x7e,0x00,0xff,0xd4,0xff,0xfe,0x01,0x00,0x56,0x34,0x12,0xaf]

 0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]

 0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00
 # GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00]

@@ -200,6 +202,15 @@
 0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
 # GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+
+
 0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_eq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x92,0xd4,0x01,0x05,0x02,0x00]

@@ -336,7 +347,6 @@
 # GFX12-REAL16: v_cmpx_eq_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb2,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_eq_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_eq_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_eq_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc2,0xd4,0x01,0x05,0x02,0x00]

@@ -474,7 +484,6 @@
 # GFX12-REAL16: v_cmpx_eq_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xba,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_eq_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_eq_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_eq_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xca,0xd4,0x01,0x05,0x02,0x00]

@@ -738,7 +747,6 @@
 # GFX12-REAL16: v_cmpx_ge_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb6,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ge_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ge_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc6,0xd4,0x01,0x05,0x02,0x00]

@@ -876,7 +884,6 @@
 # GFX12-REAL16: v_cmpx_ge_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbe,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ge_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ge_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ge_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xce,0xd4,0x01,0x05,0x02,0x00]

@@ -1140,7 +1147,6 @@
 # GFX12-REAL16: v_cmpx_gt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb4,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_gt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_gt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_gt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc4,0xd4,0x01,0x05,0x02,0x00]

@@ -1278,7 +1284,6 @@
 # GFX12-REAL16: v_cmpx_gt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbc,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_gt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_gt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_gt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcc,0xd4,0x01,0x05,0x02,0x00]

@@ -1542,7 +1547,6 @@
 # GFX12-REAL16: v_cmpx_le_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb3,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_le_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_le_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_le_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc3,0xd4,0x01,0x05,0x02,0x00]

@@ -1680,7 +1684,6 @@
 # GFX12-REAL16: v_cmpx_le_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbb,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_le_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_le_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_le_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcb,0xd4,0x01,0x05,0x02,0x00]

@@ -2080,7 +2083,6 @@
 # GFX12-REAL16: v_cmpx_lt_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb1,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_lt_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_lt_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_lt_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc1,0xd4,0x01,0x05,0x02,0x00]

@@ -2218,7 +2220,6 @@
 # GFX12-REAL16: v_cmpx_lt_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb9,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_lt_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb9,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_lt_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_lt_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc9,0xd4,0x01,0x05,0x02,0x00]

@@ -2356,7 +2357,6 @@
 # GFX12-REAL16: v_cmpx_ne_i16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xb5,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ne_i16_e64 v255, v255 ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ne_i16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0x01,0x05,0x02,0x00]

 0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ne_i32_e64 v1, v2 ; encoding: [0x7e,0x00,0xc5,0xd4,0x01,0x05,0x02,0x00]

@@ -2494,8 +2494,6 @@
 # GFX12-REAL16: v_cmpx_ne_u16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0xbd,0xd4,0xff,0xff,0x03,0x00]
 # GFX12-FAKE16: v_cmpx_ne_u16_e64 v255, v255 ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xff,0x03,0x00]
-# GFX11: v_cmpx_ne_u16_e64 v1.h, v2.l op_sel:[1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0x01,0x05,0x02,0x00]
-
 0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00
 # GFX12: v_cmpx_ne_u32_e64 v1, v2 ; encoding: [0x7e,0x00,0xcd,0xd4,0x01,0x05,0x02,0x00]

diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
index 2dc231a4220f1..a1061a067d73c 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
@@ -171,49 +171,123 @@
 # GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xfa,0xfe,0x03,0x20,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]

 0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x82,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

 0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

 0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

 0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x82,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_eq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x82,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x82,0xd4,0xff,0xff,0x03,0x00]
+
+0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x82,0xd4,0x01,0x04,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x82,0xd4,0x69,0xd2,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x82,0xd4,0x6a,0xf6,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x82,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00
+# GFX12: v_cmpx_eq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x82,0xd4,0x7b,0xfa,0x01,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00
+# GFX12: v_cmpx_eq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x82,0xd4,0x7d,0xe0,0x01,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00
+# GFX12: v_cmpx_eq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x82,0xd4,0x7e,0x82,0x01,0x00]
+
+0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x82,0xd4,0x7f,0xf8,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x82,0xd4,0x7c,0xfc,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x82,0xd4,0xc1,0xfe,0x00,0x00]
+
+0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40
+# GFX12: v_cmpx_eq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x82,0xd4,0xf0,0xfa,0x00,0x40]
+
+0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20
+# GFX12: v_cmpx_eq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x82,0xd4,0xfd,0xd4,0x00,0x20]
+
+0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
+# GFX12: v_cmpx_eq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x82,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]
+
+0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x82,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x82,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

 0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
 # GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x92,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

@@ -367,17 +441,14 @@
0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_eq_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb2,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -537,17 +608,14 @@ 0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_eq_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xba,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp 
v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -794,17 +862,14 @@ 0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ge_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb6,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -961,17 +1026,14 @@ 0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ge_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbe,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1218,17 +1280,14 @@ 0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_gt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb4,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1385,17 +1444,14 @@ 0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_gt_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbc,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1642,17 +1698,14 @@ 0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_le_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb3,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -1809,17 +1862,14 @@ 0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_le_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbb,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2230,17 +2280,14 @@ 0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_lt_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb1,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2397,17 +2444,14 @@ 0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_lt_u16_e64 0xfe0b, vcc_hi ; encoding: 
[0x7e,0x00,0xb9,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2564,17 +2608,14 @@ 0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ne_i16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xb5,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x10,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] @@ -2731,17 +2772,14 @@ 0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_cmpx_ne_u16_e64 0xfe0b, vcc_hi ; encoding: [0x7e,0x00,0xbd,0xd4,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x18,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x08,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -# GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index cff9497778265..56e1ea1194a5c 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -38,19 +38,37 @@ # GFX12: v_cmpx_class_f32_e64_dpp -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x01,0xfe,0xd4,0xea,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] 0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp v1, s3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x82,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x82,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x82,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x82,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x92,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -82,17 +100,14 @@ # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: 
v_cmpx_eq_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb2,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_eq_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb2,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -118,17 +133,14 @@ # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xba,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_eq_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_eq_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xba,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -187,17 +199,14 @@ # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb6,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ge_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb6,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -223,17 +232,14 @@ # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbe,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ge_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ge_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbe,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -289,17 +295,14 @@ # 
GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb4,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_gt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb4,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -325,17 +328,14 @@ # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbc,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_gt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; 
encoding: [0x7e,0x10,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_gt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -391,17 +391,14 @@ # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb3,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_le_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb3,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -427,17 +424,14 @@ # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbb,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_le_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_le_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -540,17 +534,14 @@ # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb1,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_lt_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb1,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -576,17 +567,14 @@ # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x08,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb9,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_lt_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_lt_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb9,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -612,17 +600,14 @@ # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xb5,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ne_i16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_i16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xb5,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] @@ -651,17 +636,14 @@ # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v1.h, v2.l op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x08,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0xbd,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -# GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12-REAL16: v_cmpx_ne_u16_e64_dpp v255.l, v255.h op_sel:[0,1] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x10,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] # GFX12-FAKE16: v_cmpx_ne_u16_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xbd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index 6ca815a1c88d3..b458e613d2f20 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -162,49 +162,80 @@ # GFX12: v_cmpx_class_f64_e32 0xaf123456, v255 ; encoding: [0xff,0xfe,0xff,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x04,0x7d] 0x7f,0x05,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x04,0x7d] 0x01,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x04,0x7d] 0x69,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x04,0x7d] 0x6a,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x04,0x7d] 0x6b,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x04,0x7d] 0x7b,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x04,0x7d] 0x7d,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x04,0x7d] 0x7e,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 exec_lo, v2.l ; 
encoding: [0x7e,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x04,0x7d] 0x7f,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x04,0x7d] 0x7c,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x04,0x7d] 0xc1,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x04,0x7d] 0xf0,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x04,0x7d] 0xfd,0x04,0x04,0x7d -# GFX12: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] +# GFX12-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x04,0x7d] 0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x04,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x04,0x7d +# GFX12-REAL16: v_cmpx_eq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x04,0x7d] + +0xff,0x05,0x04,0x7d +# GFX12-REAL16: v_cmpx_eq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x04,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x04,0x7d] + +0xfd,0x04,0x05,0x7d +# GFX12-REAL16: v_cmpx_eq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x05,0x7d] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x05,0x7d] + +0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x05,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x24,0x7d # GFX12: v_cmpx_eq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x24,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index f1fca29120490..0289ed58f07e3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -111,46 +111,68 @@ # GFX12: v_cmpx_class_f32 -|v255|, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0xfd,0x7d,0xff,0x6f,0x3d,0x30] 0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x04,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_eq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_eq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x04,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_eq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x05,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_eq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_eq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x05,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_eq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x24,0x7d,0x01,0x1b,0x00,0xff] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index b2539ad5a49e7..e55d6a4c77c2a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -27,10 +27,20 @@ # GFX12: v_cmpx_class_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0xfd,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_eq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x04,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_eq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x04,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_eq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_eq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x05,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_eq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_eq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x05,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_eq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x24,0x7d,0x01,0x77,0x39,0x05] From d789915f35a976bb532441915249cd1b165c2fd5 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 24 Jan 2025 15:32:35 -0800 Subject: [PATCH 063/432] [Github] Bump Runner Version in Containers (#124324) This patch bumps the runner version to v2.322.0 in the CI containers. Nothing looks suspicious in the change log, and it is important to keep the runner up to date or we will end up with containers that cannot connect to Github due to having a version too old. 
--- .../workflows/containers/github-action-ci-windows/Dockerfile | 2 +- .github/workflows/containers/github-action-ci/Dockerfile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile index 2295e39d62c30..9a1fab694c9df 100644 --- a/.github/workflows/containers/github-action-ci-windows/Dockerfile +++ b/.github/workflows/containers/github-action-ci-windows/Dockerfile @@ -108,7 +108,7 @@ RUN choco install -y handle RUN pip3 install pywin32 buildbot-worker==2.8.4 -ARG RUNNER_VERSION=2.321.0 +ARG RUNNER_VERSION=2.322.0 ENV RUNNER_VERSION=$RUNNER_VERSION RUN powershell -Command \ diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index 35a0f1f6020dc..377b8f14402ee 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -96,7 +96,7 @@ WORKDIR /home/gha FROM ci-container as ci-container-agent -ENV GITHUB_RUNNER_VERSION=2.321.0 +ENV GITHUB_RUNNER_VERSION=2.322.0 RUN mkdir actions-runner && \ cd actions-runner && \ From 280c7d719834a828895b8a39f8ea982527fdcc73 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 24 Jan 2025 15:37:36 -0800 Subject: [PATCH 064/432] [CI] Increase Configurability of Monolithic Windows Build (#124328) This patch makes it so that the caller of monolithic-windows.sh can set the maximum number of parallel compile/link jobs in environment variables rather than hard-coding them in the CMake invocation. Additionally, the env variable definitions for CC, CXX, and LD are sunk into the shell script, since those config options are inherent to what the pipeline is testing. This is intended to make things more flexible and usable for the new premerge CI pipeline, particularly as we are looking at using larger runners and want the increased flexibility to experiment.
--- .ci/generate-buildkite-pipeline-premerge | 5 ++--- .ci/monolithic-windows.sh | 8 ++++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge index 9d9ca32183944..e547afaeb722f 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -128,9 +128,8 @@ if [[ "${windows_projects}" != "" ]]; then limit: 2 timeout_in_minutes: 150 env: - CC: 'cl' - CXX: 'cl' - LD: 'link' + MAX_PARALLEL_COMPILE_JOBS: '16' + MAX_PARALLEL_LINK_JOBS: '4' commands: - 'C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64' - 'bash .ci/monolithic-windows.sh "$(echo ${windows_projects} | tr ' ' ';')" "$(echo ${windows_check_targets})"' diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 68303a3ea153a..57b276f3e1df0 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -50,6 +50,10 @@ echo "--- cmake" pip install -q -r "${MONOREPO_ROOT}"/mlir/python/requirements.txt pip install -q -r "${MONOREPO_ROOT}"/.ci/requirements.txt +export CC=cl +export CXX=cl +export LD=link + # The CMAKE_*_LINKER_FLAGS to disable the manifest come from research # on fixing a build reliability issue on the build server, please # see https://github.com/llvm/llvm-project/pull/82393 and @@ -72,8 +76,8 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \ - -D LLVM_PARALLEL_COMPILE_JOBS=16 \ - -D LLVM_PARALLEL_LINK_JOBS=4 + -D LLVM_PARALLEL_COMPILE_JOBS=${MAX_PARALLEL_COMPILE_JOBS} \ + -D LLVM_PARALLEL_LINK_JOBS=${MAX_PARALLEL_LINK_JOBS} echo "--- ninja" # Targets are not escaped as they are passed as separate arguments. From 6409799bdcd86be3ed72e8d172181294d3e5ad09 Mon Sep 17 00:00:00 2001 From: vporpo Date: Fri, 24 Jan 2025 15:39:37 -0800 Subject: [PATCH 065/432] [SandboxVec][Legality] Pack from different BBs (#124363) When the inputs of the pack come from different BBs we need to make sure we emit the pack instructions at the correct place. 
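For illustration, here is a minimal standalone sketch of the same-BB guard this adds (toy stand-ins rather than the real sandboxir types; `allInSameBB` is a hypothetical helper, not part of the patch):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Toy stand-ins for sandboxir::BasicBlock and sandboxir::Instruction.
struct BasicBlock {};
struct Instruction {
  BasicBlock *Parent = nullptr;
  BasicBlock *getParent() const { return Parent; }
};

// True when every instruction in the bundle lives in one basic block; when
// this fails, vectorizing the bundle is rejected and a pack is emitted
// instead.
bool allInSameBB(const std::vector<Instruction *> &Bndl) {
  assert(!Bndl.empty() && "expected a non-empty bundle");
  const BasicBlock *BB = Bndl.front()->getParent();
  return std::all_of(Bndl.begin() + 1, Bndl.end(),
                     [BB](const Instruction *I) { return I->getParent() == BB; });
}
```

The change itself (below) does the equivalent with `any_of` over `drop_begin(Bndl)` and reports `ResultReason::DiffBBs`, so such bundles are packed rather than vectorized.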
--- .../Vectorize/SandboxVectorizer/Legality.h | 3 ++ .../Vectorize/SandboxVectorizer/Legality.cpp | 5 ++++ .../test/Transforms/SandboxVectorizer/pack.ll | 27 +++++++++++++++++ .../SandboxVectorizer/LegalityTest.cpp | 29 +++++++++++++++++-- 4 files changed, 62 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h index f10c535aa820e..156b788d8a203 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h @@ -91,6 +91,7 @@ enum class ResultReason { DiffTypes, DiffMathFlags, DiffWrapFlags, + DiffBBs, NotConsecutive, CantSchedule, Unimplemented, @@ -127,6 +128,8 @@ struct ToStr { return "DiffMathFlags"; case ResultReason::DiffWrapFlags: return "DiffWrapFlags"; + case ResultReason::DiffBBs: + return "DiffBBs"; case ResultReason::NotConsecutive: return "NotConsecutive"; case ResultReason::CantSchedule: diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp index 085f4cd67ab76..48bc246e4b56a 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Legality.cpp @@ -214,6 +214,11 @@ const LegalityResult &LegalityAnalysis::canVectorize(ArrayRef Bndl, dumpBndl(Bndl);); return createLegalityResult(ResultReason::NotInstructions); } + // Pack if not in the same BB. + auto *BB = cast(Bndl[0])->getParent(); + if (any_of(drop_begin(Bndl), + [BB](auto *V) { return cast(V)->getParent() != BB; })) + return createLegalityResult(ResultReason::DiffBBs); auto CollectDescrs = getHowToCollectValues(Bndl); if (CollectDescrs.hasVectorInputs()) { diff --git a/llvm/test/Transforms/SandboxVectorizer/pack.ll b/llvm/test/Transforms/SandboxVectorizer/pack.ll index a0aa2a79a0ade..ec6e61a90c0fb 100644 --- a/llvm/test/Transforms/SandboxVectorizer/pack.ll +++ b/llvm/test/Transforms/SandboxVectorizer/pack.ll @@ -88,3 +88,30 @@ loop: exit: ret void } + +define void @packFromDiffBBs(ptr %ptr, i8 %v) { +; CHECK-LABEL: define void @packFromDiffBBs( +; CHECK-SAME: ptr [[PTR:%.*]], i8 [[V:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ADD0:%.*]] = add i8 [[V]], 1 +; CHECK-NEXT: br label %[[BB:.*]] +; CHECK: [[BB]]: +; CHECK-NEXT: [[ADD1:%.*]] = add i8 [[V]], 2 +; CHECK-NEXT: [[PACK:%.*]] = insertelement <2 x i8> poison, i8 [[ADD0]], i32 0 +; CHECK-NEXT: [[PACK1:%.*]] = insertelement <2 x i8> [[PACK]], i8 [[ADD1]], i32 1 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 0 +; CHECK-NEXT: store <2 x i8> [[PACK1]], ptr [[GEP0]], align 1 +; CHECK-NEXT: ret void +; +entry: + %add0 = add i8 %v, 1 + br label %bb + +bb: + %add1 = add i8 %v, 2 + %gep0 = getelementptr i8, ptr %ptr, i64 0 + %gep1 = getelementptr i8, ptr %ptr, i64 1 + store i8 %add0, ptr %gep0 + store i8 %add1, ptr %gep1 + ret void +} diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp index b421d08bc6b02..acc887f9dc6c1 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp @@ -57,11 +57,24 @@ struct LegalityTest : public testing::Test { } }; +static sandboxir::BasicBlock *getBasicBlockByName(sandboxir::Function *F, + StringRef Name) { + for (sandboxir::BasicBlock &BB : *F) + if 
(BB.getName() == Name) return &BB; llvm_unreachable("Expected to find basic block!"); } TEST_F(LegalityTest, LegalitySkipSchedule) { parseIR(C, R"IR( define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float %farg0, float %farg1, i64 %v0, i64 %v1, i32 %v2) { +entry: %gep0 = getelementptr float, ptr %ptr, i32 0 %gep1 = getelementptr float, ptr %ptr, i32 1 + store float %farg0, ptr %gep1 + br label %bb + +bb: %gep3 = getelementptr float, ptr %ptr, i32 3 %ld0 = load float, ptr %gep0 %ld0b = load float, ptr %gep0 @@ -89,10 +102,14 @@ define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float sandboxir::Context Ctx(C); auto *F = Ctx.createFunction(LLVMF); - auto *BB = &*F->begin(); - auto It = BB->begin(); + auto *EntryBB = getBasicBlockByName(F, "entry"); + auto It = EntryBB->begin(); [[maybe_unused]] auto *Gep0 = cast(&*It++); [[maybe_unused]] auto *Gep1 = cast(&*It++); + auto *St1Entry = cast(&*It++); + + auto *BB = getBasicBlockByName(F, "bb"); + It = BB->begin(); [[maybe_unused]] auto *Gep3 = cast(&*It++); auto *Ld0 = cast(&*It++); auto *Ld0b = cast(&*It++); @@ -162,6 +179,14 @@ define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float EXPECT_EQ(cast(Result).getReason(), sandboxir::ResultReason::DiffWrapFlags); } + { + // Check DiffBBs + const auto &Result = + Legality.canVectorize({St0, St1Entry}, /*SkipScheduling=*/true); + EXPECT_TRUE(isa(Result)); + EXPECT_EQ(cast(Result).getReason(), + sandboxir::ResultReason::DiffBBs); + } { // Check DiffTypes for unary operands that have a different type. const auto &Result = Legality.canVectorize({Trunc64to8, Trunc32to8}, From 48657bf29b01e95749b5ecd8c7f675c14a7948d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Fri, 24 Jan 2025 15:52:05 -0800 Subject: [PATCH 066/432] [flang][cuda] Handle launch of cooperative kernel (#124362) Add a `CUFLaunchCooperativeKernel` entry point and lower gpu.launch_func with the grid_global attribute to it.
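As a rough host-side illustration of the occupancy-driven grid sizing the new entry point performs when a grid dimension is left negative (simplified from the runtime code in the diff below; `launchCooperativeByOccupancy` and its lack of error handling are illustrative assumptions):

```cpp
#include <cuda_runtime.h>

// Sketch: size a cooperative grid from device occupancy, then launch. A
// cooperative launch must be resident on the device all at once, so the grid
// is capped at (SM count) * (max active blocks per SM).
cudaError_t launchCooperativeByOccupancy(const void *kernel, dim3 block,
                                         int smem, void **params) {
  int dev = 0, numSMs = 0, maxBlocksPerSM = 0;
  cudaGetDevice(&dev);
  cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, dev);
  int threadsPerBlock = static_cast<int>(block.x * block.y * block.z);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&maxBlocksPerSM, kernel,
                                                threadsPerBlock, smem);
  dim3 grid(numSMs * maxBlocksPerSM, 1, 1);
  return cudaLaunchCooperativeKernel(kernel, grid, block, params, smem,
                                     /*stream=*/0);
}
```

The actual entry point additionally divides the occupancy-derived block count by any grid dimensions the caller did specify, and it rejects more than one negative grid dimension.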
--- flang/include/flang/Runtime/CUDA/kernel.h | 4 ++ .../Transforms/CUFGPUToLLVMConversion.cpp | 18 +++-- flang/runtime/CUDA/kernel.cpp | 65 +++++++++++++++++++ flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir | 35 ++++++++++ 4 files changed, 116 insertions(+), 6 deletions(-) diff --git a/flang/include/flang/Runtime/CUDA/kernel.h b/flang/include/flang/Runtime/CUDA/kernel.h index 85afda09e347a..1f812b580327a 100644 --- a/flang/include/flang/Runtime/CUDA/kernel.h +++ b/flang/include/flang/Runtime/CUDA/kernel.h @@ -28,6 +28,10 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX, intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ, int32_t smem, void **params, void **extra); +void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, void **params, void **extra); + } // extern "C" #endif // FORTRAN_RUNTIME_CUDA_KERNEL_H_ diff --git a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp index 60aa401e1cc8c..c469b5a95b044 100644 --- a/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp +++ b/flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp @@ -139,20 +139,26 @@ struct GPULaunchKernelConversion adaptor.getBlockSizeY(), adaptor.getBlockSizeZ(), dynamicMemorySize, kernelArgs, nullPtr}); } else { - auto funcOp = mod.lookupSymbol( - RTNAME_STRING(CUFLaunchKernel)); + auto procAttr = + op->getAttrOfType(cuf::getProcAttrName()); + bool isGridGlobal = + procAttr && procAttr.getValue() == cuf::ProcAttribute::GridGlobal; + llvm::StringRef fctName = isGridGlobal + ? RTNAME_STRING(CUFLaunchCooperativeKernel) + : RTNAME_STRING(CUFLaunchKernel); + auto funcOp = mod.lookupSymbol(fctName); auto funcTy = mlir::LLVM::LLVMFunctionType::get( voidTy, {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy}, /*isVarArg=*/false); - auto cufLaunchKernel = mlir::SymbolRefAttr::get( - mod.getContext(), RTNAME_STRING(CUFLaunchKernel)); + auto cufLaunchKernel = + mlir::SymbolRefAttr::get(mod.getContext(), fctName); if (!funcOp) { mlir::OpBuilder::InsertionGuard insertGuard(rewriter); rewriter.setInsertionPointToStart(mod.getBody()); - auto launchKernelFuncOp = rewriter.create( - loc, RTNAME_STRING(CUFLaunchKernel), funcTy); + auto launchKernelFuncOp = + rewriter.create(loc, fctName, funcTy); launchKernelFuncOp.setVisibility( mlir::SymbolTable::Visibility::Private); } diff --git a/flang/runtime/CUDA/kernel.cpp b/flang/runtime/CUDA/kernel.cpp index bdc04ccb17672..02d89fb8423a5 100644 --- a/flang/runtime/CUDA/kernel.cpp +++ b/flang/runtime/CUDA/kernel.cpp @@ -151,4 +151,69 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX, CUDA_REPORT_IF_ERROR(cudaLaunchKernelExC(&config, kernel, params)); } +void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, void **params, void **extra) { + dim3 gridDim; + gridDim.x = gridX; + gridDim.y = gridY; + gridDim.z = gridZ; + dim3 blockDim; + blockDim.x = blockX; + blockDim.y = blockY; + blockDim.z = blockZ; + unsigned nbNegGridDim{0}; + if (gridX < 0) { + ++nbNegGridDim; + } + if (gridY < 0) { + ++nbNegGridDim; + } + if (gridZ < 0) { + ++nbNegGridDim; + } + if (nbNegGridDim == 1) { + int maxBlocks, nbBlocks, dev, multiProcCount; + cudaError_t err1, 
err2; + nbBlocks = blockDim.x * blockDim.y * blockDim.z; + cudaGetDevice(&dev); + err1 = cudaDeviceGetAttribute( + &multiProcCount, cudaDevAttrMultiProcessorCount, dev); + err2 = cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &maxBlocks, kernel, nbBlocks, smem); + if (err1 == cudaSuccess && err2 == cudaSuccess) { + maxBlocks = multiProcCount * maxBlocks; + } + if (maxBlocks > 0) { + if (gridX > 0) { + maxBlocks = maxBlocks / gridDim.x; + } + if (gridY > 0) { + maxBlocks = maxBlocks / gridDim.y; + } + if (gridZ > 0) { + maxBlocks = maxBlocks / gridDim.z; + } + if (maxBlocks < 1) { + maxBlocks = 1; + } + if (gridX < 0) { + gridDim.x = maxBlocks; + } + if (gridY < 0) { + gridDim.y = maxBlocks; + } + if (gridZ < 0) { + gridDim.z = maxBlocks; + } + } + } else if (nbNegGridDim > 1) { + Fortran::runtime::Terminator terminator{__FILE__, __LINE__}; + terminator.Crash("Too many invalid grid dimensions"); + } + cudaStream_t stream = 0; // TODO stream managment + CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel( + kernel, gridDim, blockDim, params, smem, stream)); +} + } // extern "C" diff --git a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir index 3db2336c90a7d..0827e378c7c07 100644 --- a/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir +++ b/flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir @@ -131,3 +131,38 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry, d // CHECK-LABEL: llvm.func @_QQmain() // CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1 // CHECK: llvm.call @_FortranACUFLaunchClusterKernel(%[[KERNEL_PTR]], {{.*}}) + +// ----- + +module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry, dense<32> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry, dense<64> : vector<4xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { + llvm.func @_QMmod1Phost_sub() { + %0 = llvm.mlir.constant(1 : i32) : i32 + %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr + %2 = llvm.mlir.constant(40 : i64) : i64 + %3 = llvm.mlir.constant(16 : i32) : i32 + %4 = llvm.mlir.constant(25 : i32) : i32 + %5 = llvm.mlir.constant(21 : i32) : i32 + %6 = llvm.mlir.constant(17 : i32) : i32 + %7 = llvm.mlir.constant(1 : index) : i64 + %8 = llvm.mlir.constant(27 : i32) : i32 + %9 = llvm.mlir.constant(6 : i32) : i32 + %10 = llvm.mlir.constant(1 : i32) : i32 + %11 = llvm.mlir.constant(0 : i32) : i32 + %12 = llvm.mlir.constant(10 : index) : i64 + %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr + %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr + gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) 
threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc} llvm.return } llvm.func @_QMmod1Psub1(!llvm.ptr) -> () llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> { %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8> llvm.return %0 : !llvm.array<2 x i8> } llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"} llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"} gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">] } // CHECK-LABEL: llvm.func @_QMmod1Phost_sub() // CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel From c725a95e088dea14953c2d891d04429bc50b912e Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Fri, 24 Jan 2025 15:58:13 -0800 Subject: [PATCH 067/432] [MemProf] Convert Hot contexts to NotCold early (#124219) While we convert hot contexts to notcold contexts during the cloning step, their existence was greatly limiting the context trimming performed when we add the MemProf profile to the IR. To address this, any hot contexts are converted to notcold contexts immediately after the initial check for unambiguous allocation types, before that check is repeated and before metadata is added during context trimming. Note that hot hints are now disabled by default; however, this change avoids adding unnecessary overhead if they are re-enabled. --- .../include/llvm/Analysis/MemoryProfileInfo.h | 14 ++++ llvm/lib/Analysis/MemoryProfileInfo.cpp | 29 +++++++- llvm/test/Transforms/PGOProfile/memprof.ll | 9 --- .../Analysis/MemoryProfileInfoTest.cpp | 73 ++++++------------- 4 files changed, 62 insertions(+), 63 deletions(-) diff --git a/llvm/include/llvm/Analysis/MemoryProfileInfo.h b/llvm/include/llvm/Analysis/MemoryProfileInfo.h index 215139caef696..deb7ab134c161 100644 --- a/llvm/include/llvm/Analysis/MemoryProfileInfo.h +++ b/llvm/include/llvm/Analysis/MemoryProfileInfo.h @@ -65,6 +65,15 @@ class CallStackTrie { std::map Callers; CallStackTrieNode(AllocationType Type) : AllocTypes(static_cast(Type)) {} + void addAllocType(AllocationType AllocType) { + AllocTypes |= static_cast(AllocType); + } + void removeAllocType(AllocationType AllocType) { + AllocTypes &= ~static_cast(AllocType); + } + bool hasAllocType(AllocationType AllocType) const { + return AllocTypes & static_cast(AllocType); + } }; // The node for the allocation at the root. @@ -85,6 +94,11 @@ class CallStackTrie { void collectContextSizeInfo(CallStackTrieNode *Node, std::vector &ContextSizeInfo); + // Recursively convert hot allocation types to notcold, since we don't + // actually do any cloning for hot contexts, to facilitate more aggressive + // pruning of contexts. + void convertHotToNotCold(CallStackTrieNode *Node); + // Recursive helper to trim contexts and create metadata nodes.
bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx, std::vector &MIBCallStack, diff --git a/llvm/lib/Analysis/MemoryProfileInfo.cpp b/llvm/lib/Analysis/MemoryProfileInfo.cpp index 52f4adbdb0429..5553a2e2dd24b 100644 --- a/llvm/lib/Analysis/MemoryProfileInfo.cpp +++ b/llvm/lib/Analysis/MemoryProfileInfo.cpp @@ -147,7 +147,7 @@ void CallStackTrie::addCallStack( First = false; if (Alloc) { assert(AllocStackId == StackId); - Alloc->AllocTypes |= static_cast(AllocType); + Alloc->addAllocType(AllocType); } else { AllocStackId = StackId; Alloc = new CallStackTrieNode(AllocType); @@ -159,7 +159,7 @@ void CallStackTrie::addCallStack( auto Next = Curr->Callers.find(StackId); if (Next != Curr->Callers.end()) { Curr = Next->second; - Curr->AllocTypes |= static_cast(AllocType); + Curr->addAllocType(AllocType); continue; } // Otherwise add a new caller node. @@ -228,6 +228,15 @@ void CallStackTrie::collectContextSizeInfo( collectContextSizeInfo(Caller.second, ContextSizeInfo); } +void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) { + if (Node->hasAllocType(AllocationType::Hot)) { + Node->removeAllocType(AllocationType::Hot); + Node->addAllocType(AllocationType::NotCold); + } + for (auto &Caller : Node->Callers) + convertHotToNotCold(Caller.second); +} + // Recursive helper to trim contexts and create metadata nodes. // Caller should have pushed Node's loc to MIBCallStack. Doing this in the // caller makes it simpler to handle the many early returns in this method. @@ -307,6 +316,22 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) { "single"); return false; } + // If there were any hot allocation contexts, the Alloc trie node would have + // the Hot type set. If so, because we don't currently support cloning for hot + // contexts, they should be converted to NotCold. This happens in the cloning + // support anyway, however, doing this now enables more aggressive context + // trimming when building the MIB metadata (and possibly may make the + // allocation have a single NotCold allocation type), greatly reducing + // overheads in bitcode, cloning memory and cloning time. + if (Alloc->hasAllocType(AllocationType::Hot)) { + convertHotToNotCold(Alloc); + // Check whether we now have a single alloc type. 
+ if (hasSingleAllocType(Alloc->AllocTypes)) { + addSingleAllocTypeAttribute(CI, (AllocationType)Alloc->AllocTypes, + "single"); + return false; + } + } auto &Ctx = CI->getContext(); std::vector MIBCallStack; MIBCallStack.push_back(AllocStackId); diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll index 367069e993fe1..6aa2d307a1dc8 100644 --- a/llvm/test/Transforms/PGOProfile/memprof.ll +++ b/llvm/test/Transforms/PGOProfile/memprof.ll @@ -84,8 +84,6 @@ ; RUN: llvm-profdata merge -memprof-random-hotness -memprof-random-hotness-seed=1730170724 %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand2 2>&1 | FileCheck %s --check-prefix=RAND2 ; RAND2: random hotness seed = 1730170724 ; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2,ALL,MEMPROFONLY,MEMPROFSTATS -;; Check with hot hints enabled -; RUN: opt < %s -memprof-use-hot-hints -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2HOT ; MEMPROFMATCHINFO: MemProf notcold context with id 1093248920606587996 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 5725971306423925017 has total profiled size 10 is matched @@ -413,13 +411,6 @@ for.end: ; preds = %for.cond ; MEMPROFRAND2: !"notcold" ; MEMPROFRAND2: !"notcold" -;; With hot hints enabled the last 2 should be hot. -; MEMPROFRAND2HOT: !"cold" -; MEMPROFRAND2HOT: !"cold" -; MEMPROFRAND2HOT: !"cold" -; MEMPROFRAND2HOT: !"hot" -; MEMPROFRAND2HOT: !"hot" - ; MEMPROFSTATS: 8 memprof - Number of alloc contexts in memory profile. ; MEMPROFSTATS: 10 memprof - Number of callsites in memory profile. ; MEMPROFSTATS: 6 memprof - Number of functions having valid memory profile. diff --git a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp index 3888faf5453d3..b4e81e69116e8 100644 --- a/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp +++ b/llvm/unittests/Analysis/MemoryProfileInfoTest.cpp @@ -165,6 +165,8 @@ define i32* @test() { %1 = bitcast i8* %call2 to i32* %call3 = call noalias dereferenceable_or_null(40) i8* @malloc(i64 noundef 40) %2 = bitcast i8* %call3 to i32* + %call4 = call noalias dereferenceable_or_null(40) i8* @malloc(i64 noundef 40) + %3 = bitcast i8* %call4 to i32* ret i32* %1 } declare dso_local noalias noundef i8* @malloc(i64 noundef) @@ -204,6 +206,18 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_FALSE(Call3->hasMetadata(LLVMContext::MD_memprof)); EXPECT_TRUE(Call3->hasFnAttr("memprof")); EXPECT_EQ(Call3->getFnAttr("memprof").getValueAsString(), "hot"); + + // Fourth call has hot and non-cold contexts. These should be treated as + // notcold and given a notcold attribute. 
+ CallStackTrie Trie4; + Trie4.addCallStack(AllocationType::Hot, {5, 6}); + Trie4.addCallStack(AllocationType::NotCold, {5, 7, 8}); + CallBase *Call4 = findCall(*Func, "call4"); + Trie4.buildAndAttachMIBMetadata(Call4); + + EXPECT_FALSE(Call4->hasMetadata(LLVMContext::MD_memprof)); + EXPECT_TRUE(Call4->hasFnAttr("memprof")); + EXPECT_EQ(Call4->getFnAttr("memprof").getValueAsString(), "notcold"); } // Test CallStackTrie::addCallStack interface taking allocation type and list of @@ -299,56 +313,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Cold); else { ASSERT_EQ(StackId->getZExtValue(), 3u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); - } - } -} - -// Test CallStackTrie::addCallStack interface taking allocation type and list of -// call stack ids. -// Test that an allocation call reached by both non cold and hot call stacks -// gets memprof metadata representing the different allocation type contexts. -TEST_F(MemoryProfileInfoTest, NotColdAndHotMIB) { - LLVMContext C; - std::unique_ptr M = makeLLVMModule(C, - R"IR( -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-pc-linux-gnu" -define i32* @test() { -entry: - %call = call noalias dereferenceable_or_null(40) i8* @malloc(i64 noundef 40) - %0 = bitcast i8* %call to i32* - ret i32* %0 -} -declare dso_local noalias noundef i8* @malloc(i64 noundef) -)IR"); - - Function *Func = M->getFunction("test"); - - CallStackTrie Trie; - Trie.addCallStack(AllocationType::NotCold, {1, 2}); - Trie.addCallStack(AllocationType::Hot, {1, 3}); - - CallBase *Call = findCall(*Func, "call"); - Trie.buildAndAttachMIBMetadata(Call); - - EXPECT_FALSE(Call->hasFnAttr("memprof")); - EXPECT_TRUE(Call->hasMetadata(LLVMContext::MD_memprof)); - MDNode *MemProfMD = Call->getMetadata(LLVMContext::MD_memprof); - ASSERT_EQ(MemProfMD->getNumOperands(), 2u); - for (auto &MIBOp : MemProfMD->operands()) { - MDNode *MIB = dyn_cast(MIBOp); - MDNode *StackMD = getMIBStackNode(MIB); - ASSERT_NE(StackMD, nullptr); - ASSERT_EQ(StackMD->getNumOperands(), 2u); - auto *StackId = mdconst::dyn_extract(StackMD->getOperand(0)); - ASSERT_EQ(StackId->getZExtValue(), 1u); - StackId = mdconst::dyn_extract(StackMD->getOperand(1)); - if (StackId->getZExtValue() == 2u) + // Hot contexts are converted to NotCold when building the metadata. EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); - else { - ASSERT_EQ(StackId->getZExtValue(), 3u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); } } } @@ -401,7 +367,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } else { ASSERT_EQ(StackId->getZExtValue(), 4u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); + // Hot contexts are converted to NotCold when building the metadata. + EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } } } @@ -463,7 +430,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); else { ASSERT_EQ(StackId->getZExtValue(), 8u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); + // Hot contexts are converted to NotCold when building the metadata. 
+ EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } } } @@ -606,7 +574,8 @@ declare dso_local noalias noundef i8* @malloc(i64 noundef) EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); else { ASSERT_EQ(StackId->getZExtValue(), 8u); - EXPECT_EQ(getMIBAllocType(MIB), AllocationType::Hot); + // Hot contexts are converted to NotCold when building the new metadata. + EXPECT_EQ(getMIBAllocType(MIB), AllocationType::NotCold); } } } From db1ee18eda6329d7577ad019a47822220b3e293d Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 24 Jan 2025 15:56:07 -0800 Subject: [PATCH 068/432] NFC: Typo fix Change-Id: I08470bc617490558250136ea35a4964003fa9981 --- llvm/docs/AMDGPUUsage.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 8f09df2406f10..71f11bf89368f 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1327,7 +1327,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. with the fifth i32 operand. The i1 sixth operand is used to clamp the output. The i1s preceding the vector operands decide the signedness. - llvm.amdgcn.sched_barrier Controls the types of instructions that may be allowed to cross the intrinsic + llvm.amdgcn.sched.barrier Controls the types of instructions that may be allowed to cross the intrinsic during instruction scheduling. The parameter is a mask for the instruction types that can cross the intrinsic. @@ -1345,7 +1345,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. - 0x0200: All DS write instructions may be scheduled across sched_barrier. - 0x0400: All Transcendental (e.g. V_EXP) instructions may be scheduled across sched_barrier. - llvm.amdgcn.sched_group_barrier Creates schedule groups with specific properties to create custom scheduling + llvm.amdgcn.sched.group.barrier Creates schedule groups with specific properties to create custom scheduling pipelines. The ordering between groups is enforced by the instruction scheduler. The intrinsic applies to the code that preceeds the intrinsic. The intrinsic takes three values that control the behavior of the schedule groups. @@ -1369,7 +1369,7 @@ The AMDGPU backend implements the following LLVM IR intrinsics. | ``// 5 MFMA`` | ``__builtin_amdgcn_sched_group_barrier(8, 5, 0)`` - llvm.amdgcn.iglp_opt An **experimental** intrinsic for instruction group level parallelism. The intrinsic + llvm.amdgcn.iglp.opt An **experimental** intrinsic for instruction group level parallelism. The intrinsic implements predefined intruction scheduling orderings. The intrinsic applies to the surrounding scheduling region. The intrinsic takes a value that specifies the strategy. The compiler implements two strategies. From 1b4bd4e1a5120c8bb4daa44787a3bc4559b6b3b4 Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Fri, 24 Jan 2025 16:43:02 -0800 Subject: [PATCH 069/432] [BOLT][AArch64] Remove assertions from jump table heuristic (#124372) The code for jump table detection on AArch64 asserts liberally whenever the input instruction sequence does not match the expected pattern. As a result, BOLT fails to process binaries with such sequences instead of ignoring functions with unknown control flow. Remove asserts in analyzeIndirectBranchFragment() and mark indirect jumps as instructions with unknown control flow instead. 
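Reduced to a standalone sketch, the change is a switch from hard assertions to soft matching failures (toy types below, not BOLT's real MCInst API):

```cpp
// Toy model of the matcher's new error handling; AArch64 opcodes and the
// MCInst interface are heavily simplified here.
enum Opcode { ADDXrx, ADDXrs };

struct Inst {
  Opcode Op;
  Opcode getOpcode() const { return Op; }
};

// Before: assert(I.getOpcode() == ADDXrx && "Failed to match indirect branch!");
// After: an unmatched sequence is a soft failure; the caller then marks the
// jump as unknown control flow instead of aborting.
bool matchJumpTablePattern(const Inst &I) {
  if (I.getOpcode() != ADDXrx)
    return false;
  // ... continue matching the rest of the sequence ...
  return true;
}
```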
--- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 45 ++++++++++--------- bolt/test/AArch64/jump-table-heuristic-fail.s | 29 ++++++++++++ bolt/test/AArch64/test-indirect-branch.s | 9 ++-- 3 files changed, 58 insertions(+), 25 deletions(-) create mode 100644 bolt/test/AArch64/jump-table-heuristic-fail.s diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index ac709c5dd063a..4b21ff719b3ab 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -834,6 +834,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { /// # of this BB) /// br x0 # Indirect jump instruction /// + /// Return true on successful jump table instruction sequence match, false + /// otherwise. bool analyzeIndirectBranchFragment( const MCInst &Inst, DenseMap> &UDChain, @@ -842,6 +844,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // Expect AArch64 BR assert(Inst.getOpcode() == AArch64::BR && "Unexpected opcode"); + JumpTable = nullptr; + // Match the indirect branch pattern for aarch64 SmallVector &UsesRoot = UDChain[&Inst]; if (UsesRoot.size() == 0 || UsesRoot[0] == nullptr) @@ -879,8 +883,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // Parsed as ADDXrs reg:x8 reg:x8 reg:x12 imm:0 return false; } - assert(DefAdd->getOpcode() == AArch64::ADDXrx && - "Failed to match indirect branch!"); + if (DefAdd->getOpcode() != AArch64::ADDXrx) + return false; // Validate ADD operands int64_t OperandExtension = DefAdd->getOperand(3).getImm(); @@ -897,8 +901,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // ldr w7, [x6] // add x6, x6, w7, sxtw => no shift amount // br x6 - errs() << "BOLT-WARNING: " - "Failed to match indirect branch: ShiftVAL != 2 \n"; + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: " + "failed to match indirect branch: ShiftVAL != 2\n"); return false; } @@ -909,7 +913,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { else if (ExtendType == AArch64_AM::SXTW) ScaleValue = 4LL; else - llvm_unreachable("Failed to match indirect branch! (fragment 3)"); + return false; // Match an ADR to load base address to be used when addressing JT targets SmallVector &UsesAdd = UDChain[DefAdd]; @@ -920,18 +924,15 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return false; } MCInst *DefBaseAddr = UsesAdd[1]; - assert(DefBaseAddr->getOpcode() == AArch64::ADR && - "Failed to match indirect branch pattern! (fragment 3)"); + if (DefBaseAddr->getOpcode() != AArch64::ADR) + return false; PCRelBase = DefBaseAddr; // Match LOAD to load the jump table (relative) target const MCInst *DefLoad = UsesAdd[2]; - assert(mayLoad(*DefLoad) && - "Failed to match indirect branch load pattern! (1)"); - assert((ScaleValue != 1LL || isLDRB(*DefLoad)) && - "Failed to match indirect branch load pattern! (2)"); - assert((ScaleValue != 2LL || isLDRH(*DefLoad)) && - "Failed to match indirect branch load pattern! (3)"); + if (!mayLoad(*DefLoad) || (ScaleValue == 1LL && !isLDRB(*DefLoad)) || + (ScaleValue == 2LL && !isLDRH(*DefLoad))) + return false; // Match ADD that calculates the JumpTable Base Address (not the offset) SmallVector &UsesLoad = UDChain[DefLoad]; @@ -941,7 +942,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { isRegToRegMove(*DefJTBaseAdd, From, To)) { // Sometimes base address may have been defined in another basic block // (hoisted). Return with no jump table info. 
- JumpTable = nullptr; return true; } @@ -953,24 +953,27 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // adr x12, 0x247b30 <__gettextparse+0x5b0> // add x13, x12, w13, sxth #2 // br x13 - errs() << "BOLT-WARNING: Failed to match indirect branch: " - "nop/adr instead of adrp/add \n"; + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: failed to match indirect branch: " + "nop/adr instead of adrp/add\n"); return false; } - assert(DefJTBaseAdd->getOpcode() == AArch64::ADDXri && - "Failed to match jump table base address pattern! (1)"); + if (DefJTBaseAdd->getOpcode() != AArch64::ADDXri) { + LLVM_DEBUG(dbgs() << "BOLT-DEBUG: failed to match jump table base " + "address pattern! (1)\n"); + return false; + } if (DefJTBaseAdd->getOperand(2).isImm()) Offset = DefJTBaseAdd->getOperand(2).getImm(); SmallVector &UsesJTBaseAdd = UDChain[DefJTBaseAdd]; const MCInst *DefJTBasePage = UsesJTBaseAdd[1]; if (DefJTBasePage == nullptr || isLoadFromStack(*DefJTBasePage)) { - JumpTable = nullptr; return true; } - assert(DefJTBasePage->getOpcode() == AArch64::ADRP && - "Failed to match jump table base page pattern! (2)"); + if (DefJTBasePage->getOpcode() != AArch64::ADRP) + return false; + if (DefJTBasePage->getOperand(1).isExpr()) JumpTable = DefJTBasePage->getOperand(1).getExpr(); return true; diff --git a/bolt/test/AArch64/jump-table-heuristic-fail.s b/bolt/test/AArch64/jump-table-heuristic-fail.s new file mode 100644 index 0000000000000..724171ac39925 --- /dev/null +++ b/bolt/test/AArch64/jump-table-heuristic-fail.s @@ -0,0 +1,29 @@ +## Verify that BOLT does not crash while encountering instruction sequence that +## does not perfectly match jump table pattern. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags --target=aarch64-unknown-linux %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --print-cfg 2>&1 | FileCheck %s + + .section .text + .align 4 + .globl _start + .type _start, %function +_start: + sub w0, w0, #0x4a +## The address loaded into x22 is undefined. However, the instructions that +## follow ldr, use the x22 address as a regular jump table. + ldr x22, [x29, #0x98] + ldrb w0, [x22, w0, uxtw] + adr x1, #12 + add x0, x1, w0, sxtb #2 + br x0 +# CHECK: br x0 # UNKNOWN +.L0: + ret +.size _start, .-_start + +## Force relocation mode. + .reloc 0, R_AARCH64_NONE diff --git a/bolt/test/AArch64/test-indirect-branch.s b/bolt/test/AArch64/test-indirect-branch.s index 168e50c8f47f5..1e16e76b11530 100644 --- a/bolt/test/AArch64/test-indirect-branch.s +++ b/bolt/test/AArch64/test-indirect-branch.s @@ -3,10 +3,11 @@ // clang-format off -// REQUIRES: system-linux +// REQUIRES: system-linux, asserts + // RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o // RUN: %clang %cflags --target=aarch64-unknown-linux %t.o -o %t.exe -Wl,-q -// RUN: llvm-bolt %t.exe -o %t.bolt --print-cfg --strict\ +// RUN: llvm-bolt %t.exe -o %t.bolt --print-cfg --strict --debug-only=mcplus \ // RUN: -v=1 2>&1 | FileCheck %s // Pattern 1: there is no shift amount after the 'add' instruction. 
@@ -39,7 +40,7 @@ _start: // svc #0 // Pattern 1 -// CHECK: BOLT-WARNING: Failed to match indirect branch: ShiftVAL != 2 +// CHECK: BOLT-DEBUG: failed to match indirect branch: ShiftVAL != 2 .globl test1 .type test1, %function test1: @@ -57,7 +58,7 @@ test1_2: ret // Pattern 2 -// CHECK: BOLT-WARNING: Failed to match indirect branch: nop/adr instead of adrp/add +// CHECK: BOLT-DEBUG: failed to match indirect branch: nop/adr instead of adrp/add .globl test2 .type test2, %function test2: From d92bac8a3ebb19106f6bca6b7613a27c52cb48ab Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Fri, 24 Jan 2025 16:48:35 -0800 Subject: [PATCH 070/432] [HLSL] Introduce address space `hlsl_constant(2)` for constant buffer declarations (#123411) Introduces a new address space `hlsl_constant(2)` for constant buffer declarations. This address space is applied to declarations inside a `cbuffer` block. Later on, it will also be applied to the `ConstantBuffer` syntax and the default `$Globals` constant buffer. Clang codegen translates constant buffer declarations to global variables and loads from the `hlsl_constant(2)` address space. Follow-up work will add metadata that maps these globals to individual constant buffers and enables their transformation to the appropriate constant buffer load intrinsics later in an LLVM pass. Fixes #123406 --- clang/include/clang/Basic/AddressSpaces.h | 1 + clang/lib/AST/TypePrinter.cpp | 6 +- clang/lib/Basic/Targets/AArch64.h | 1 + clang/lib/Basic/Targets/AMDGPU.cpp | 21 ++++--- clang/lib/Basic/Targets/DirectX.h | 1 + clang/lib/Basic/Targets/NVPTX.h | 1 + clang/lib/Basic/Targets/SPIR.h | 2 + clang/lib/Basic/Targets/SystemZ.h | 1 + clang/lib/Basic/Targets/TCE.h | 1 + clang/lib/Basic/Targets/WebAssembly.h | 1 + clang/lib/Basic/Targets/X86.h | 1 + clang/lib/CodeGen/CGHLSLRuntime.cpp | 17 +---- clang/lib/Sema/SemaHLSL.cpp | 13 +++- .../ast-dump-comment-cbuffer-tbuffer.hlsl | 62 ------------------- .../AST/HLSL/ast-dump-comment-cbuffer.hlsl | 32 ++++++++++ clang/test/AST/HLSL/cbuffer.hlsl | 24 +++---- .../test/AST/HLSL/cbuffer_and_namespaces.hlsl | 14 ++--- clang/test/AST/HLSL/packoffset.hlsl | 38 ++++++------ clang/test/AST/HLSL/pch_hlsl_buffer.hlsl | 8 +-- .../test/AST/HLSL/resource_binding_attr.hlsl | 8 +-- clang/test/CodeGenHLSL/cbuf.hlsl | 19 ++++-- clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl | 14 +++-- .../static_global_and_function_in_cb.hlsl | 15 +++-- .../SemaTemplate/address_space-dependent.cpp | 2 +- 24 files changed, 148 insertions(+), 155 deletions(-) delete mode 100644 clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl create mode 100644 clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl diff --git a/clang/include/clang/Basic/AddressSpaces.h b/clang/include/clang/Basic/AddressSpaces.h index 7b723d508fff1..d18bfe54931f9 100644 --- a/clang/include/clang/Basic/AddressSpaces.h +++ b/clang/include/clang/Basic/AddressSpaces.h @@ -58,6 +58,7 @@ enum class LangAS : unsigned { // HLSL specific address spaces. hlsl_groupshared, + hlsl_constant, // Wasm specific address spaces.
wasm_funcref, diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index a850410ffc846..31695374cb52b 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2552,10 +2552,12 @@ std::string Qualifiers::getAddrSpaceAsString(LangAS AS) { return "__uptr __ptr32"; case LangAS::ptr64: return "__ptr64"; - case LangAS::wasm_funcref: - return "__funcref"; case LangAS::hlsl_groupshared: return "groupshared"; + case LangAS::hlsl_constant: + return "hlsl_constant"; + case LangAS::wasm_funcref: + return "__funcref"; default: return std::to_string(toTargetAddressSpace(AS)); } diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index ecf80b23a508c..600940f5e4e23 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -44,6 +44,7 @@ static const unsigned ARM64AddrSpaceMap[] = { static_cast(AArch64AddrSpace::ptr32_uptr), static_cast(AArch64AddrSpace::ptr64), 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 99f8f2944e279..0d308cb6af969 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -59,6 +59,7 @@ const LangASMap AMDGPUTargetInfo::AMDGPUDefIsGenMap = { llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_uptr llvm::AMDGPUAS::FLAT_ADDRESS, // ptr64 llvm::AMDGPUAS::FLAT_ADDRESS, // hlsl_groupshared + llvm::AMDGPUAS::CONSTANT_ADDRESS, // hlsl_constant }; const LangASMap AMDGPUTargetInfo::AMDGPUDefIsPrivMap = { @@ -74,16 +75,16 @@ const LangASMap AMDGPUTargetInfo::AMDGPUDefIsPrivMap = { llvm::AMDGPUAS::CONSTANT_ADDRESS, // cuda_constant llvm::AMDGPUAS::LOCAL_ADDRESS, // cuda_shared // SYCL address space values for this map are dummy - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_device - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_host - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_local - llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_private - llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_sptr - llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_uptr - llvm::AMDGPUAS::FLAT_ADDRESS, // ptr64 - llvm::AMDGPUAS::FLAT_ADDRESS, // hlsl_groupshared - + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_device + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_global_host + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_local + llvm::AMDGPUAS::FLAT_ADDRESS, // sycl_private + llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_sptr + llvm::AMDGPUAS::FLAT_ADDRESS, // ptr32_uptr + llvm::AMDGPUAS::FLAT_ADDRESS, // ptr64 + llvm::AMDGPUAS::FLAT_ADDRESS, // hlsl_groupshared + llvm::AMDGPUAS::CONSTANT_ADDRESS, // hlsl_constant }; } // namespace targets } // namespace clang diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h index ab22d1281a4df..4e6bc0e040398 100644 --- a/clang/lib/Basic/Targets/DirectX.h +++ b/clang/lib/Basic/Targets/DirectX.h @@ -42,6 +42,7 @@ static const unsigned DirectXAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 3, // hlsl_groupshared + 2, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 
20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index d81b89a7f24ac..c6531148fe30c 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -46,6 +46,7 @@ static const unsigned NVPTXAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 5a328b9ceeb1d..c0849b69dcdb3 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -47,6 +47,7 @@ static const unsigned SPIRDefIsPrivMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 2, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref @@ -80,6 +81,7 @@ static const unsigned SPIRDefIsGenMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index d05948586c467..bd2827cf13a5b 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -42,6 +42,7 @@ static const unsigned ZOSAddressMap[] = { 1, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant 0 // wasm_funcref }; diff --git a/clang/lib/Basic/Targets/TCE.h b/clang/lib/Basic/Targets/TCE.h index d6280b02f07b2..edec30bf69de0 100644 --- a/clang/lib/Basic/Targets/TCE.h +++ b/clang/lib/Basic/Targets/TCE.h @@ -51,6 +51,7 @@ static const unsigned TCEOpenCLAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h index 0a14da6a277b8..cfecc59ac75fd 100644 --- a/clang/lib/Basic/Targets/WebAssembly.h +++ b/clang/lib/Basic/Targets/WebAssembly.h @@ -42,6 +42,7 @@ static const unsigned WebAssemblyAddrSpaceMap[] = { 0, // ptr32_uptr 0, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant 20, // wasm_funcref }; diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 2c200e64a3d84..8bd54e362526f 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -46,6 +46,7 @@ static const unsigned X86AddrSpaceMap[] = { 271, // ptr32_uptr 272, // ptr64 0, // hlsl_groupshared + 0, // hlsl_constant // Wasm address space values for this target are dummy values, // as it is only enabled for Wasm targets. 20, // wasm_funcref diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 345e218f42451..2ce54cc3c52ef 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -100,22 +100,6 @@ GlobalVariable *replaceBuffer(CGHLSLRuntime::Buffer &Buf) { llvm::formatv("{0}{1}", Buf.Name, Buf.IsCBuffer ? ".cb." : ".tb."), GlobalValue::NotThreadLocal); - IRBuilder<> B(CBGV->getContext()); - Value *ZeroIdx = B.getInt32(0); - // Replace Const use with CB use. 
-  for (auto &[GV, Offset] : Buf.Constants) {
-    Value *GEP =
-        B.CreateGEP(Buf.LayoutStruct, CBGV, {ZeroIdx, B.getInt32(Offset)});
-
-    assert(Buf.LayoutStruct->getElementType(Offset) == GV->getValueType() &&
-           "constant type mismatch");
-
-    // Replace.
-    GV->replaceAllUsesWith(GEP);
-    // Erase GV.
-    GV->removeDeadConstantUsers();
-    GV->eraseFromParent();
-  }
   return CBGV;
 }

@@ -144,6 +128,7 @@ void CGHLSLRuntime::addConstant(VarDecl *D, Buffer &CB) {
   }

   auto *GV = cast<llvm::GlobalVariable>(CGM.GetAddrOfGlobalVar(D));
+  GV->setExternallyInitialized(true);
   // Add debug info for constVal.
   if (CGDebugInfo *DI = CGM.getModuleDebugInfo())
     if (CGM.getCodeGenOpts().getDebugInfo() >=
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index f26469e6a2f1d..a7033cb54886a 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -25,6 +25,7 @@
 #include "clang/Basic/IdentifierTable.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/SourceLocation.h"
+#include "clang/Basic/Specifiers.h"
 #include "clang/Basic/TargetInfo.h"
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/ParsedAttr.h"
@@ -475,14 +476,20 @@ void createHostLayoutStructForBuffer(Sema &S, HLSLBufferDecl *BufDecl) {
   LS->setImplicit(true);
   LS->startDefinition();

-  for (const Decl *D : BufDecl->decls()) {
-    const VarDecl *VD = dyn_cast<VarDecl>(D);
+  for (Decl *D : BufDecl->decls()) {
+    VarDecl *VD = dyn_cast<VarDecl>(D);
     if (!VD || VD->getStorageClass() == SC_Static)
       continue;
     const Type *Ty = VD->getType()->getUnqualifiedDesugaredType();
     if (FieldDecl *FD =
-            createFieldForHostLayoutStruct(S, Ty, VD->getIdentifier(), LS))
+            createFieldForHostLayoutStruct(S, Ty, VD->getIdentifier(), LS)) {
+      // add the field decl to the layout struct
       LS->addDecl(FD);
+      // update address space of the original decl to hlsl_constant
+      QualType NewTy =
+          AST.getAddrSpaceQualType(VD->getType(), LangAS::hlsl_constant);
+      VD->setType(NewTy);
+    }
   }
   LS->completeDefinition();
   BufDecl->addDecl(LS);
diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl
deleted file mode 100644
index 0bff3ae144037..0000000000000
--- a/clang/test/AST/HLSL/ast-dump-comment-cbuffer-tbuffer.hlsl
+++ /dev/null
@@ -1,62 +0,0 @@
-// RUN: %clang_cc1 -Wdocumentation -ast-dump=json -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=JSON
-// RUN: %clang_cc1 -Wdocumentation -ast-dump -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=AST
-
-// JSON:"kind": "HLSLBufferDecl",
-// JSON:"name": "A",
-// JSON-NEXT:"bufferKind": "cbuffer",
-// JSON:"kind": "TextComment",
-// JSON:"text": " CBuffer decl."
-
-/// CBuffer decl.
-cbuffer A {
-    // JSON: "kind": "VarDecl",
-    // JSON: "name": "a",
-    // JSON: "qualType": "float"
-    float a;
-    // JSON: "kind": "VarDecl",
-    // JSON: "name": "b",
-    // JSON: "qualType": "int"
-    int b;
-}
-
-// JSON:"kind": "HLSLBufferDecl",
-// JSON:"name": "B",
-// JSON-NEXT:"bufferKind": "tbuffer",
-// JSON:"kind": "TextComment",
-// JSON:"text": " TBuffer decl."
-
-/// TBuffer decl.
-tbuffer B { - // JSON: "kind": "VarDecl", - // JSON: "name": "c", - // JSON: "qualType": "float" - float c; - // JSON: "kind": "VarDecl", - // JSON: "name": "d", - // JSON: "qualType": "int" - int d; -} - -// AST: HLSLBufferDecl {{.*}} line:11:9 cbuffer A -// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer -// AST-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer -// AST-NEXT: FullComment -// AST-NEXT: ParagraphComment -// AST-NEXT: TextComment {{.*}} Text=" CBuffer decl." -// AST-NEXT: VarDecl {{.*}} a 'float' -// AST-NEXT: VarDecl {{.*}} b 'int' -// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_A definition -// AST: FieldDecl {{.*}} a 'float' -// AST-NEXT: FieldDecl {{.*}} b 'int' - -// AST-NEXT: HLSLBufferDecl {{.*}} line:29:9 tbuffer B -// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV -// AST-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer -// AST-NEXT: FullComment -// AST-NEXT: ParagraphComment -// AST-NEXT: TextComment {{.*}} Text=" TBuffer decl." -// AST-NEXT: VarDecl {{.*}} c 'float' -// AST-NEXT: VarDecl {{.*}} d 'int' -// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_B definition -// AST: FieldDecl {{.*}} c 'float' -// AST-NEXT: FieldDecl {{.*}} d 'int' diff --git a/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl new file mode 100644 index 0000000000000..b2b3e13308da3 --- /dev/null +++ b/clang/test/AST/HLSL/ast-dump-comment-cbuffer.hlsl @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -Wdocumentation -ast-dump=json -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=JSON +// RUN: %clang_cc1 -Wdocumentation -ast-dump -x hlsl -triple dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefix=AST + +// JSON:"kind": "HLSLBufferDecl", +// JSON:"name": "A", +// JSON-NEXT:"bufferKind": "cbuffer", +// JSON:"kind": "TextComment", +// JSON:"text": " CBuffer decl." + +/// CBuffer decl. +cbuffer A { + // JSON: "kind": "VarDecl", + // JSON: "name": "a", + // JSON: "qualType": "hlsl_constant float" + float a; + // JSON: "kind": "VarDecl", + // JSON: "name": "b", + // JSON: "qualType": "hlsl_constant int" + int b; +} + +// AST: HLSLBufferDecl {{.*}} line:11:9 cbuffer A +// AST-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer +// AST-NEXT: HLSLResourceAttr {{.*}} Implicit CBuffer +// AST-NEXT: FullComment +// AST-NEXT: ParagraphComment +// AST-NEXT: TextComment {{.*}} Text=" CBuffer decl." 
+// AST-NEXT: VarDecl {{.*}} a 'hlsl_constant float' +// AST-NEXT: VarDecl {{.*}} b 'hlsl_constant int' +// AST-NEXT: CXXRecordDecl {{.*}} implicit class __layout_A definition +// AST: FieldDecl {{.*}} a 'float' +// AST-NEXT: FieldDecl {{.*}} b 'int' diff --git a/clang/test/AST/HLSL/cbuffer.hlsl b/clang/test/AST/HLSL/cbuffer.hlsl index 721abb290f163..f516cf5099e82 100644 --- a/clang/test/AST/HLSL/cbuffer.hlsl +++ b/clang/test/AST/HLSL/cbuffer.hlsl @@ -48,7 +48,7 @@ struct TwoFloats { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} col:9 used a1 'float' + // CHECK: VarDecl {{.*}} col:9 used a1 'hlsl_constant float' float a1; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB definition // CHECK: FieldDecl {{.*}} a1 'float' @@ -60,7 +60,7 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(OneFloat, __layout // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} col:9 used a2 'float' + // CHECK: VarDecl {{.*}} col:9 used a2 'hlsl_constant float' float a2; // CHECK: VarDecl {{.*}} col:19 b2 'RWBuffer':'hlsl::RWBuffer' RWBuffer b2; @@ -68,7 +68,7 @@ cbuffer CB { EmptyStruct c2; // CHECK: VarDecl {{.*}} col:9 d2 'float[0]' float d2[0]; - // CHECK: VarDecl {{.*}} col:9 e2 'float' + // CHECK: VarDecl {{.*}} col:9 e2 'hlsl_constant float' float e2; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_1 definition // CHECK: FieldDecl {{.*}} a2 'float' @@ -81,11 +81,11 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layou // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} col:5 used s1 'A' + // CHECK: VarDecl {{.*}} col:5 used s1 'hlsl_constant A' A s1; - // CHECK: VarDecl {{.*}} col:5 s2 'B' + // CHECK: VarDecl {{.*}} col:5 s2 'hlsl_constant B' B s2; - // CHECK: VarDecl {{.*}} col:12 s3 'CTypedef':'C + // CHECK: VarDecl {{.*}} col:12 s3 'CTypedef':'C' CTypedef s3; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_2 definition // CHECK: FieldDecl {{.*}} s1 'A' @@ -102,7 +102,7 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layou // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} s4 'D' + // CHECK: VarDecl {{.*}} s4 'hlsl_constant D' D s4; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_3 definition // CHECK: FieldDecl {{.*}} s4 '__layout_D' @@ -120,9 +120,9 @@ _Static_assert(__builtin_hlsl_is_scalarized_layout_compatible(TwoFloats, __layou // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} s5 'E' + // CHECK: VarDecl {{.*}} s5 'hlsl_constant E' E s5; - // CHECK: VarDecl {{.*}} s6 'BTypedef':'B' + // CHECK: VarDecl {{.*}} s6 'hlsl_constant BTypedef':'hlsl_constant B' BTypedef s6; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB_4 definition // CHECK: FieldDecl {{.*}} s5 '__layout_E' @@ -158,7 +158,7 @@ cbuffer CB { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB { - // CHECK: VarDecl {{.*}} s8 'F' + // CHECK: VarDecl {{.*}} s8 'hlsl_constant F' F s8; // CHECK: CXXRecordDecl {{.*}} implicit 
referenced class __layout_CB_6 definition // CHECK: FieldDecl {{.*}} s8 '__layout_F' @@ -182,7 +182,7 @@ cbuffer CB { // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' RWBuffer f; } s9; - // CHECK: VarDecl {{.*}} s9 'struct (unnamed struct at {{.*}}cbuffer.hlsl:177:3 + // CHECK: VarDecl {{.*}} s9 'hlsl_constant struct (unnamed struct at {{.*}}cbuffer.hlsl:177:3 // CHECK: CXXRecordDecl {{.*}} struct definition struct { // CHECK: FieldDecl {{.*}} g 'float' @@ -190,7 +190,7 @@ cbuffer CB { // CHECK: FieldDecl {{.*}} f 'RWBuffer':'hlsl::RWBuffer' RWBuffer f; } s10; - // CHECK: VarDecl {{.*}} s10 'struct (unnamed struct at {{.*}}cbuffer.hlsl:187:3 + // CHECK: VarDecl {{.*}} s10 'hlsl_constant struct (unnamed struct at {{.*}}cbuffer.hlsl:187:3 // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon definition // CHECK: FieldDecl {{.*}} e 'float' // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_anon_1 definition diff --git a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl index 4b1bbea736f85..12ce327d8be02 100644 --- a/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl +++ b/clang/test/AST/HLSL/cbuffer_and_namespaces.hlsl @@ -35,11 +35,11 @@ struct Foo { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB1 { - // CHECK: VarDecl {{.*}} foo1 'Foo' + // CHECK: VarDecl {{.*}} foo1 'hlsl_constant Foo' Foo foo1; - // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + // CHECK: VarDecl {{.*}} foo2 'hlsl_constant NS1::Foo' NS1::Foo foo2; - // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + // CHECK: VarDecl {{.*}} foo3 'hlsl_constant NS1::Bar::Foo' NS1::Bar::Foo foo3; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB1 definition // CHECK: FieldDecl {{.*}} foo1 '__layout_Foo' @@ -65,13 +65,13 @@ namespace NS2 { // CHECK: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK: HLSLResourceAttr {{.*}} Implicit CBuffer cbuffer CB2 { - // CHECK: VarDecl {{.*}} foo0 '::Foo':'Foo' + // CHECK: VarDecl {{.*}} foo0 'hlsl_constant ::Foo':'hlsl_constant Foo' ::Foo foo0; - // CHECK: VarDecl {{.*}} foo1 'Foo':'NS2::Foo' + // CHECK: VarDecl {{.*}} foo1 'hlsl_constant Foo':'hlsl_constant NS2::Foo' Foo foo1; - // CHECK: VarDecl {{.*}} foo2 'NS1::Foo' + // CHECK: VarDecl {{.*}} foo2 'hlsl_constant NS1::Foo' NS1::Foo foo2; - // CHECK: VarDecl {{.*}} foo3 'NS1::Bar::Foo' + // CHECK: VarDecl {{.*}} foo3 'hlsl_constant NS1::Bar::Foo' NS1::Bar::Foo foo3; // CHECK: CXXRecordDecl {{.*}} implicit referenced class __layout_CB2 definition // CHECK: FieldDecl {{.*}} foo0 '__layout_Foo' diff --git a/clang/test/AST/HLSL/packoffset.hlsl b/clang/test/AST/HLSL/packoffset.hlsl index 9c928bd6d922e..a9bb90bb386f9 100644 --- a/clang/test/AST/HLSL/packoffset.hlsl +++ b/clang/test/AST/HLSL/packoffset.hlsl @@ -6,13 +6,13 @@ cbuffer A { // CHECK-NEXT:-HLSLResourceClassAttr {{.*}} <> Implicit CBuffer // CHECK-NEXT:-HLSLResourceAttr {{.*}} <> Implicit CBuffer - // CHECK-NEXT: VarDecl {{.*}} A1 'float4' + // CHECK-NEXT: VarDecl {{.*}} A1 'hlsl_constant float4' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0 float4 A1 : packoffset(c); - // CHECK-NEXT: VarDecl {{.*}} col:11 A2 'float' + // CHECK-NEXT: VarDecl {{.*}} col:11 A2 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 0 float A2 : packoffset(c1); - // CHECK-NEXT: VarDecl {{.*}} col:11 A3 'float' + // CHECK-NEXT: VarDecl {{.*}} col:11 A3 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 1 float A3 : 
packoffset(c1.y); } @@ -20,13 +20,13 @@ cbuffer A // CHECK: HLSLBufferDecl {{.*}} cbuffer B cbuffer B { - // CHECK: VarDecl {{.*}} B0 'float' + // CHECK: VarDecl {{.*}} B0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1 float B0 : packoffset(c0.g); - // CHECK-NEXT: VarDecl {{.*}} B1 'double' + // CHECK-NEXT: VarDecl {{.*}} B1 'hlsl_constant double' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 2 double B1 : packoffset(c0.b); - // CHECK-NEXT: VarDecl {{.*}} B2 'half' + // CHECK-NEXT: VarDecl {{.*}} B2 'hlsl_constant half' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 0 half B2 : packoffset(c0.r); } @@ -34,13 +34,13 @@ cbuffer B // CHECK: HLSLBufferDecl {{.*}} cbuffer C cbuffer C { - // CHECK: VarDecl {{.*}} C0 'float' + // CHECK: VarDecl {{.*}} C0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 float C0 : packoffset(c0.y); - // CHECK-NEXT: VarDecl {{.*}} C1 'float2' + // CHECK-NEXT: VarDecl {{.*}} C1 'hlsl_constant float2' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 float2 C1 : packoffset(c0.z); - // CHECK-NEXT: VarDecl {{.*}} C2 'half' + // CHECK-NEXT: VarDecl {{.*}} C2 'hlsl_constant half' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 half C2 : packoffset(c0.x); } @@ -49,16 +49,16 @@ cbuffer C // CHECK: HLSLBufferDecl {{.*}} cbuffer D cbuffer D { - // CHECK: VarDecl {{.*}} D0 'float' + // CHECK: VarDecl {{.*}} D0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1 float D0 : packoffset(c0.y); - // CHECK-NEXT: VarDecl {{.*}} D1 'float[2]' + // CHECK-NEXT: VarDecl {{.*}} D1 'hlsl_constant float[2]' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 1 0 float D1[2] : packoffset(c1.x); - // CHECK-NEXT: VarDecl {{.*}} D2 'half3' + // CHECK-NEXT: VarDecl {{.*}} D2 'hlsl_constant half3' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 1 half3 D2 : packoffset(c2.y); - // CHECK-NEXT: VarDecl {{.*}} D3 'double' + // CHECK-NEXT: VarDecl {{.*}} D3 'hlsl_constant double' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 2 double D3 : packoffset(c0.z); } @@ -71,13 +71,13 @@ struct ST { // CHECK: HLSLBufferDecl {{.*}} cbuffer S cbuffer S { - // CHECK: VarDecl {{.*}} S0 'float' + // CHECK: VarDecl {{.*}} S0 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 1 float S0 : packoffset(c0.y); - // CHECK: VarDecl {{.*}} S1 'ST' + // CHECK: VarDecl {{.*}} S1 'hlsl_constant ST' // CHECK: HLSLPackOffsetAttr {{.*}} 1 0 ST S1 : packoffset(c1); - // CHECK: VarDecl {{.*}} S2 'double2' + // CHECK: VarDecl {{.*}} S2 'hlsl_constant double2' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 2 0 double2 S2 : packoffset(c2); } @@ -90,13 +90,13 @@ struct ST2 { // CHECK: HLSLBufferDecl {{.*}} cbuffer S2 cbuffer S2 { - // CHECK: VarDecl {{.*}} S20 'float' + // CHECK: VarDecl {{.*}} S20 'hlsl_constant float' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 0 3 float S20 : packoffset(c0.a); - // CHECK: VarDecl {{.*}} S21 'ST2' + // CHECK: VarDecl {{.*}} S21 'hlsl_constant ST2' // CHECK: HLSLPackOffsetAttr {{.*}} 1 0 ST2 S21 : packoffset(c1); - // CHECK: VarDecl {{.*}} S22 'half' + // CHECK: VarDecl {{.*}} S22 'hlsl_constant half' // CHECK-NEXT: HLSLPackOffsetAttr {{.*}} 3 1 half S22 : packoffset(c3.y); } diff --git a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl index 3eabbb1f8ae22..98d7aba397852 100644 --- a/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl +++ b/clang/test/AST/HLSL/pch_hlsl_buffer.hlsl @@ -20,14 +20,14 @@ float foo() { // CHECK: HLSLBufferDecl {{.*}} line:7:9 imported cbuffer A // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit CBuffer // CHECK-NEXT: 
HLSLResourceAttr {{.*}} Implicit CBuffer -// CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'float' +// CHECK-NEXT: VarDecl 0x[[A:[0-9a-f]+]] {{.*}} imported used a 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl {{.*}} imported implicit class __layout_A definition // CHECK: FieldDecl {{.*}} imported a 'float' // CHECK: HLSLBufferDecl {{.*}} line:11:9 imported tbuffer B // CHECK-NEXT: HLSLResourceClassAttr {{.*}} Implicit SRV // CHECK-NEXT: HLSLResourceAttr {{.*}} Implicit TBuffer -// CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'float' +// CHECK-NEXT: VarDecl 0x[[B:[0-9a-f]+]] {{.*}} imported used b 'hlsl_constant float' // CHECK-NEXT: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} imported implicit class __layout_B definition // CHECK: FieldDecl 0x{{[0-9a-f]+}} {{.*}} imported b 'float' @@ -36,6 +36,6 @@ float foo() { // CHECK-NEXT: ReturnStmt {{.*}} // CHECK-NEXT: BinaryOperator {{.*}} 'float' '+' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' -// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[A]] 'a' 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl_constant float' lvalue Var 0x[[A]] 'a' 'hlsl_constant float' // CHECK-NEXT: ImplicitCastExpr {{.*}} 'float' -// CHECK-NEXT: DeclRefExpr {{.*}} 'float' lvalue Var 0x[[B]] 'b' 'float' +// CHECK-NEXT: DeclRefExpr {{.*}} 'hlsl_constant float' lvalue Var 0x[[B]] 'b' 'hlsl_constant float' diff --git a/clang/test/AST/HLSL/resource_binding_attr.hlsl b/clang/test/AST/HLSL/resource_binding_attr.hlsl index 13957ad3c1fcc..6fac903f75e18 100644 --- a/clang/test/AST/HLSL/resource_binding_attr.hlsl +++ b/clang/test/AST/HLSL/resource_binding_attr.hlsl @@ -4,7 +4,7 @@ // CHECK-NEXT:HLSLResourceClassAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit CBuffer // CHECK-NEXT:HLSLResourceAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit CBuffer // CHECK-NEXT:HLSLResourceBindingAttr 0x{{[0-9a-f]+}} "b3" "space2" -// CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'float' +// CHECK-NEXT:VarDecl 0x[[A:[0-9a-f]+]] {{.*}} col:9 used a 'hlsl_constant float' cbuffer CB : register(b3, space2) { float a; } @@ -13,7 +13,7 @@ cbuffer CB : register(b3, space2) { // CHECK-NEXT:HLSLResourceClassAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit SRV // CHECK-NEXT:HLSLResourceAttr 0x[[CB:[0-9a-f]+]] {{.*}} Implicit TBuffer // CHECK-NEXT:HLSLResourceBindingAttr 0x{{[0-9a-f]+}} "t2" "space1" -// CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'float' +// CHECK-NEXT:VarDecl 0x[[B:[0-9a-f]+]] {{.*}} col:9 used b 'hlsl_constant float' tbuffer TB : register(t2, space1) { float b; } @@ -21,9 +21,9 @@ tbuffer TB : register(t2, space1) { float foo() { // CHECK: BinaryOperator 0x{{[0-9a-f]+}} 'float' '+' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[A]] 'a' 'float' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'hlsl_constant float' lvalue Var 0x[[A]] 'a' 'hlsl_constant float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-f]+}} 'float' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'float' lvalue Var 0x[[B]] 'b' 'float' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-f]+}} 'hlsl_constant float' lvalue Var 0x[[B]] 'b' 'hlsl_constant float' return a + b; } diff --git a/clang/test/CodeGenHLSL/cbuf.hlsl b/clang/test/CodeGenHLSL/cbuf.hlsl index 3f9d4514967dd..825e7b8161a60 100644 --- a/clang/test/CodeGenHLSL/cbuf.hlsl +++ b/clang/test/CodeGenHLSL/cbuf.hlsl @@ -1,7 +1,14 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header 
-triple dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// CHECK: @a = external addrspace(2) externally_initialized global float, align 4 +// CHECK: @b = external addrspace(2) externally_initialized global double, align 8 +// CHECK: @c = external addrspace(2) externally_initialized global float, align 4 +// CHECK: @d = external addrspace(2) externally_initialized global double, align 8 + // CHECK: @[[CB:.+]] = external constant { float, double } cbuffer A : register(b0, space2) { float a; @@ -15,10 +22,10 @@ tbuffer A : register(t2, space1) { } float foo() { -// CHECK: load float, ptr @[[CB]], align 4 -// CHECK: load double, ptr getelementptr ({ float, double }, ptr @[[CB]], i32 0, i32 1), align 8 -// CHECK: load float, ptr @[[TB]], align 4 -// CHECK: load double, ptr getelementptr ({ float, double }, ptr @[[TB]], i32 0, i32 1), align 8 +// CHECK: load float, ptr addrspace(2) @a, align 4 +// CHECK: load double, ptr addrspace(2) @b, align 8 +// CHECK: load float, ptr addrspace(2) @c, align 4 +// CHECK: load double, ptr addrspace(2) @d, align 8 return a + b + c*d; } diff --git a/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl b/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl index 73dc376942dfb..13c401d428331 100644 --- a/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl +++ b/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl @@ -1,8 +1,14 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s \ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + +// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s // Make sure cbuffer inside namespace works. 
+
+// CHECK: @_ZN2n02n11aE = external addrspace(2) externally_initialized global float, align 4
+// CHECK: @_ZN2n01bE = external addrspace(2) externally_initialized global float, align 4
+
 // CHECK: @[[CB:.+]] = external constant { float }
 // CHECK: @[[TB:.+]] = external constant { float }
 namespace n0 {
@@ -17,7 +23,7 @@ namespace n1 {
 }

 float foo() {
-// CHECK: load float, ptr @[[CB]], align 4
-// CHECK: load float, ptr @[[TB]], align 4
+// CHECK: load float, ptr addrspace(2) @_ZN2n02n11aE, align 4
+// CHECK: load float, ptr addrspace(2) @_ZN2n01bE, align 4
   return n0::n1::a + n0::b;
 }
diff --git a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
index f85bab2113170..25f51cce2017d 100644
--- a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
+++ b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
@@ -1,16 +1,21 @@
 // RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
 // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s

-// CHECK-DAG: @[[CB:.+]] = external constant { float }
+// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \
+// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s

 cbuffer A {
-  float a;
-  // CHECK-DAG:@_ZL1b = internal global float 3.000000e+00, align 4
+  // CHECK: @a = external addrspace(2) externally_initialized global float, align 4
+  float a;
+  // CHECK: @_ZL1b = internal global float 3.000000e+00, align 4
   static float b = 3;
-  // CHECK:load float, ptr @[[CB]], align 4
-  // CHECK:load float, ptr @_ZL1b, align 4
   float foo() { return a + b; }
 }

+// CHECK: @[[CB:.+]] = external constant { float }
+
+// CHECK:define {{.*}} float @_Z3foov()
+// CHECK:load float, ptr addrspace(2) @a, align 4
+// CHECK:load float, ptr @_ZL1b, align 4
+
 float bar() {
   return foo();
diff --git a/clang/test/SemaTemplate/address_space-dependent.cpp b/clang/test/SemaTemplate/address_space-dependent.cpp
index 2ca9b8007ab41..eb8dbc69a945e 100644
--- a/clang/test/SemaTemplate/address_space-dependent.cpp
+++ b/clang/test/SemaTemplate/address_space-dependent.cpp
@@ -43,7 +43,7 @@ void neg() {

 template
 void tooBig() {
-  __attribute__((address_space(I))) int *bounds; // expected-error {{address space is larger than the maximum supported (8388586)}}
+  __attribute__((address_space(I))) int *bounds; // expected-error {{address space is larger than the maximum supported (8388585)}}
 }

 template

From 07ed8187acc31ac3f4779da452864a29d48799ac Mon Sep 17 00:00:00 2001
From: Alex MacLean
Date: Fri, 24 Jan 2025 16:56:10 -0800
Subject: [PATCH 071/432] [OpenMP] Replace nvvm.annotation usage with kernel
 calling conventions (#122320)

Specifying a kernel with the `ptx_kernel` or `amdgpu_kernel` calling
convention is more idiomatic and compile-time performant than using the
`nvvm.annotation !"kernel"` metadata. Transition OMPIRBuilder to use
calling conventions for PTX kernels and no longer emit `nvvm.annotation`.
Update OpenMPOpt to work with kernels specified via calling convention as
well as metadata. Update OpenMP tests to use the calling conventions.
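As a rough sketch of the IR difference (the function below is a made-up
minimal example; the actual changes are in the test updates of this
patch), a device kernel goes from being tagged via module metadata:

  define weak void @example_kernel() "kernel" {
    ret void
  }
  !nvvm.annotations = !{!0}
  !0 = !{ptr @example_kernel, !"kernel", i32 1}

to carrying the calling convention directly, with no metadata needed:

  define weak ptx_kernel void @example_kernel() "kernel" {
    ret void
  }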
--- clang/test/OpenMP/assumes_include_nvptx.cpp | 4 +- .../nvptx_target_firstprivate_codegen.cpp | 2 +- llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 16 +- llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 61 +- .../Transforms/OpenMP/always_inline_device.ll | 17 +- .../attributor_module_slice_reproducer.ll | 6 +- .../test/Transforms/OpenMP/barrier_removal.ll | 146 +- llvm/test/Transforms/OpenMP/bug66687.ll | 11 +- .../OpenMP/custom_state_machines.ll | 81 +- .../OpenMP/custom_state_machines_pre_lto.ll | 109 +- .../OpenMP/custom_state_machines_remarks.ll | 7 +- .../Transforms/OpenMP/deduplication_target.ll | 4 +- .../get_hardware_num_threads_in_block_fold.ll | 13 +- ...dware_num_threads_in_block_fold_optnone.ll | 13 +- .../Transforms/OpenMP/global_constructor.ll | 13 +- .../OpenMP/globalization_remarks.ll | 4 +- .../OpenMP/gpu_kernel_detection_remarks.ll | 9 +- ..._state_machine_function_ptr_replacement.ll | 4 +- .../OpenMP/is_spmd_exec_mode_fold.ll | 17 +- .../Transforms/OpenMP/nested_parallelism.ll | 13 +- .../Transforms/OpenMP/parallel_level_fold.ll | 13 +- .../Transforms/OpenMP/remove_globalization.ll | 26 +- .../OpenMP/replace_globalization.ll | 21 +- .../OpenMP/single_threaded_execution.ll | 4 +- llvm/test/Transforms/OpenMP/spmdization.ll | 1607 +++-------------- .../Transforms/OpenMP/spmdization_assumes.ll | 27 +- .../OpenMP/spmdization_constant_prop.ll | 2 - .../Transforms/OpenMP/spmdization_guarding.ll | 106 +- ...mdization_guarding_two_reaching_kernels.ll | 36 +- .../Transforms/OpenMP/spmdization_indirect.ll | 161 +- ...zation_no_guarding_two_reaching_kernels.ll | 35 +- .../Transforms/OpenMP/spmdization_remarks.ll | 7 +- .../OpenMP/value-simplify-openmp-opt.ll | 96 +- .../Frontend/OpenMPIRBuilderTest.cpp | 20 - 34 files changed, 606 insertions(+), 2105 deletions(-) diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp index 4577ea4c9c2b5..c5040989a0e40 100644 --- a/clang/test/OpenMP/assumes_include_nvptx.cpp +++ b/clang/test/OpenMP/assumes_include_nvptx.cpp @@ -11,11 +11,11 @@ // TODO: Think about teaching the OMPIRBuilder about default attributes as well so the __kmpc* declarations are annotated. 
-// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]] +// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIfEvv_{{.*}}({{.*}}) [[attr0:#[0-9]]] // CHECK: call i32 @__kmpc_target_init( // CHECK: declare noundef float @_Z3sinf(float noundef) [[attr1:#[0-9]*]] // CHECK: declare void @__kmpc_target_deinit( -// CHECK: define weak_odr protected void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]] +// CHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.*}}__Z17complex_reductionIdEvv_{{.*}}({{.*}}) [[attr0]] // CHECK: %call = call noundef double @_Z3sind(double noundef 0.000000e+00) [[attr2:#[0-9]]] // CHECK: declare noundef double @_Z3sind(double noundef) [[attr1]] diff --git a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp index d573f1cd193d6..94ace20826db4 100644 --- a/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_firstprivate_codegen.cpp @@ -90,7 +90,7 @@ int foo(int n, double *ptr) { ptr[0]++; } - // TCHECK: define weak_odr protected void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]]) + // TCHECK: define weak_odr protected ptx_kernel void @__omp_offloading_{{.+}}(ptr {{[^,]+}}, ptr noundef [[PTR_IN:%.+]]) // TCHECK: [[DYN_PTR_ADDR:%.+]] = alloca ptr, // TCHECK: [[PTR_ADDR:%.+]] = alloca ptr, // TCHECK-NOT: alloca ptr, diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 777391327f77c..8cc3a99d92023 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6468,6 +6468,8 @@ void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes( OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility); if (T.isAMDGCN()) OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL); + else if (T.isNVPTX()) + OutlinedFn->setCallingConv(CallingConv::PTX_Kernel); } } @@ -9223,20 +9225,8 @@ void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr, if (!Fn) return; - Module &M = *(Fn->getParent()); - LLVMContext &Ctx = M.getContext(); - - // Get "nvvm.annotations" metadata node. - NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations"); - - Metadata *MDVals[] = { - ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"), - ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))}; - // Append metadata to nvvm.annotations. - MD->addOperand(MDNode::get(Ctx, MDVals)); - // Add a function attribute for the kernel. 
-  Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
+  Fn->addFnAttr("kernel");
   if (T.isAMDGCN())
     Fn->addFnAttr("uniform-work-group-size", "true");
   Fn->addFnAttr(Attribute::MustProgress);
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index e7221ee406a18..10008130016c3 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -19,6 +19,7 @@

 #include "llvm/Transforms/IPO/OpenMPOpt.h"

+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/EnumeratedArray.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
@@ -36,6 +37,7 @@
 #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
 #include "llvm/IR/Assumptions.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Dominators.h"
@@ -5903,34 +5905,51 @@ bool llvm::omp::isOpenMPKernel(Function &Fn) {
   return Fn.hasFnAttribute("kernel");
 }

+static bool isKernelCC(Function &F) {
+  switch (F.getCallingConv()) {
+  default:
+    return false;
+  case CallingConv::PTX_Kernel:
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return true;
+  }
+}
+
 KernelSet llvm::omp::getDeviceKernels(Module &M) {
   // TODO: Create a more cross-platform way of determining device kernels.
-  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
   KernelSet Kernels;

-  if (!MD)
-    return Kernels;
+  DenseSet<Function *> SeenKernels;
+  auto ProcessKernel = [&](Function &KF) {
+    if (SeenKernels.insert(&KF).second) {
+      // We are only interested in OpenMP target regions. Others, such as
+      // kernels generated by CUDA but linked together, are not interesting to
+      // this pass.
+      if (isOpenMPKernel(KF)) {
+        ++NumOpenMPTargetRegionKernels;
+        Kernels.insert(&KF);
+      } else
+        ++NumNonOpenMPTargetRegionKernels;
+    }
+  };

-  for (auto *Op : MD->operands()) {
-    if (Op->getNumOperands() < 2)
-      continue;
-    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
-    if (!KindID || KindID->getString() != "kernel")
-      continue;
+  if (NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations"))
+    for (auto *Op : MD->operands()) {
+      if (Op->getNumOperands() < 2)
+        continue;
+      MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+      if (!KindID || KindID->getString() != "kernel")
+        continue;

-    Function *KernelFn =
-        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
-    if (!KernelFn)
-      continue;
+      if (auto *KernelFn =
+              mdconst::dyn_extract_or_null<Function>(Op->getOperand(0)))
+        ProcessKernel(*KernelFn);
+    }

-    // We are only interested in OpenMP target regions. Others, such as kernels
-    // generated by CUDA but linked together, are not interesting to this pass.
-    if (isOpenMPKernel(*KernelFn)) {
-      ++NumOpenMPTargetRegionKernels;
-      Kernels.insert(KernelFn);
-    } else
-      ++NumNonOpenMPTargetRegionKernels;
-  }
+  for (Function &F : M)
+    if (isKernelCC(F))
+      ProcessKernel(F);

   return Kernels;
 }
diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll
index 6028ff5278037..9c5b19f7a6c88 100644
--- a/llvm/test/Transforms/OpenMP/always_inline_device.ll
+++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll
@@ -17,7 +17,7 @@
 ; CHECK: @G = external global i8
 ; CHECK: @kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null }
 ;.
-define weak void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_fd02_c0934fc2_foo_l4(ptr %dyn) #0 { ; CHECK: Function Attrs: norecurse nounwind ; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4( ; CHECK-NEXT: entry: @@ -79,12 +79,10 @@ attributes #1 = { convergent nounwind "frame-pointer"="all" "min-legal-vector-wi attributes #2 = { convergent } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5, !6} !llvm.ident = !{!7} !0 = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0} -!1 = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} @@ -97,11 +95,10 @@ attributes #2 = { convergent } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 -1064087614, !"foo", i32 4, i32 0} -; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_c0934fc2_foo_l4, !"kernel", i32 1} -; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. diff --git a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll index 9c0416af359d4..3f4790ee15ac8 100644 --- a/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll +++ b/llvm/test/Transforms/OpenMP/attributor_module_slice_reproducer.ll @@ -13,10 +13,6 @@ define linkonce_odr hidden i8 @_ZStplIdESt7complexIT_ERKS2_S4_() local_unnamed_a ret i8 undef } -declare void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr +declare ptx_kernel void @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148(i64, i64, i64, ptr, ptr, i64, ptr, ptr, ptr, i64) local_unnamed_addr declare dso_local fastcc void @__kmpc_for_static_init_8u() unnamed_addr - -!nvvm.annotations = !{!0} - -!0 = !{ptr @__omp_offloading_2b_4010cad__ZN11qmcplusplus7ompBLAS17gemv_batched_implIfEEiRiciiPKT_PKS5_iS7_iS5_PKPS3_ii_l148, !"kernel", i32 1} diff --git a/llvm/test/Transforms/OpenMP/barrier_removal.ll b/llvm/test/Transforms/OpenMP/barrier_removal.ll index 47a5d5104aa8b..5b7544b1a7961 100644 --- a/llvm/test/Transforms/OpenMP/barrier_removal.ll +++ b/llvm/test/Transforms/OpenMP/barrier_removal.ll @@ -28,7 +28,7 @@ declare void @llvm.assume(i1) ; CHECK: @G1 = global i32 42 ; CHECK: @G2 = addrspace(1) global i32 0 ;. 
-define void @pos_empty_1(i1 %c) "kernel" { +define amdgpu_kernel void @pos_empty_1(i1 %c) "kernel" { ; MODULE-LABEL: define {{[^@]+}}@pos_empty_1 ; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4:[0-9]+]] { ; MODULE-NEXT: ret void @@ -45,7 +45,7 @@ define void @pos_empty_1(i1 %c) "kernel" { call void @llvm.assume(i1 %c) ret void } -define void @pos_empty_2() "kernel" { +define amdgpu_kernel void @pos_empty_2() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_2 ; CHECK-SAME: () #[[ATTR4:[0-9]+]] { ; CHECK-NEXT: ret void @@ -53,7 +53,7 @@ define void @pos_empty_2() "kernel" { call void @aligned_barrier() ret void } -define void @pos_empty_3() "kernel" { +define amdgpu_kernel void @pos_empty_3() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_3 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -61,7 +61,7 @@ define void @pos_empty_3() "kernel" { call void @llvm.nvvm.barrier0() ret void } -define void @pos_empty_4() "kernel" { +define amdgpu_kernel void @pos_empty_4() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_4 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -69,7 +69,7 @@ define void @pos_empty_4() "kernel" { call i32 @llvm.nvvm.barrier0.and(i32 0) ret void } -define void @pos_empty_5() "kernel" { +define amdgpu_kernel void @pos_empty_5() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_5 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -77,7 +77,7 @@ define void @pos_empty_5() "kernel" { call i32 @llvm.nvvm.barrier0.or(i32 0) ret void } -define void @pos_empty_6() "kernel" { +define amdgpu_kernel void @pos_empty_6() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_6 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -85,7 +85,7 @@ define void @pos_empty_6() "kernel" { call i32 @llvm.nvvm.barrier0.popc(i32 0) ret void } -define void @pos_empty_7a() "kernel" { +define amdgpu_kernel void @pos_empty_7a() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_7a ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() @@ -96,7 +96,7 @@ define void @pos_empty_7a() "kernel" { ret void } ; FIXME: We should remove the barrier. 
-define void @pos_empty_7b() "kernel" { +define amdgpu_kernel void @pos_empty_7b() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_7b ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() #[[ATTR5:[0-9]+]] @@ -109,7 +109,7 @@ define void @pos_empty_7b() "kernel" { call void @unknown() ret void } -define void @pos_empty_8(i1 %c) "kernel" { +define amdgpu_kernel void @pos_empty_8(i1 %c) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_8 ; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -126,7 +126,7 @@ t: f: ret void } -define void @neg_empty_8() "kernel" { +define amdgpu_kernel void @neg_empty_8() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_empty_8 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() @@ -137,7 +137,7 @@ define void @neg_empty_8() "kernel" { call void @llvm.amdgcn.s.barrier() ret void } -define void @neg_empty_9(i1 %c) "kernel" { +define amdgpu_kernel void @neg_empty_9(i1 %c) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_empty_9 ; CHECK-SAME: (i1 [[C:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -173,7 +173,7 @@ m: ret void } ; FIXME: We should remove the barrier -define void @pos_empty_10() "kernel" { +define amdgpu_kernel void @pos_empty_10() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_10 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: br label [[M:%.*]] @@ -186,7 +186,7 @@ m: call void @llvm.amdgcn.s.barrier() ret void } -define void @pos_empty_11() "kernel" { +define amdgpu_kernel void @pos_empty_11() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_empty_11 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: br label [[M:%.*]] @@ -206,7 +206,7 @@ define void @empty() { ret void } ; FIXME: We should remove the barrier in the end but not the first one. -define void @neg_empty_12(i1 %c) "kernel" { +define amdgpu_kernel void @neg_empty_12(i1 %c) "kernel" { ; MODULE-LABEL: define {{[^@]+}}@neg_empty_12 ; MODULE-SAME: (i1 [[C:%.*]]) #[[ATTR4]] { ; MODULE-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] @@ -266,7 +266,7 @@ define void @neg_empty_2() "kernel" { @GC1 = constant i32 42 @GC2 = addrspace(4) global i32 0 @GPtr4 = addrspace(4) global ptr addrspace(4) null -define void @pos_constant_loads() "kernel" { +define amdgpu_kernel void @pos_constant_loads() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_constant_loads ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(4), ptr addrspace(4) @GPtr4, align 8 @@ -296,7 +296,7 @@ define void @pos_constant_loads() "kernel" { @GS = addrspace(3) global i32 0 @GPtr = global ptr null ; TODO: We could remove some of the barriers due to the lack of write effects. 
-define void @neg_loads() "kernel" { +define amdgpu_kernel void @neg_loads() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_loads ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8 @@ -327,7 +327,7 @@ define void @neg_loads() "kernel" { @PG1 = thread_local global i32 42 @PG2 = addrspace(5) global i32 0 @GPtr5 = global ptr addrspace(5) null -define void @pos_priv_mem() "kernel" { +define amdgpu_kernel void @pos_priv_mem() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_priv_mem ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr addrspace(5), ptr @GPtr5, align 4 @@ -358,7 +358,7 @@ define void @pos_priv_mem() "kernel" { } @G1 = global i32 42 @G2 = addrspace(1) global i32 0 -define void @neg_mem() "kernel" { +define amdgpu_kernel void @neg_mem() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@neg_mem ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: [[ARG:%.*]] = load ptr, ptr @GPtr, align 8 @@ -388,7 +388,7 @@ define void @neg_mem() "kernel" { ret void } -define void @pos_multiple() "kernel" { +define amdgpu_kernel void @pos_multiple() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@pos_multiple ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: ret void @@ -404,7 +404,7 @@ define void @pos_multiple() "kernel" { ret void } -define void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" { +define amdgpu_kernel void @multiple_blocks_kernel_1(i1 %c0, i1 %c1) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_1 ; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: br i1 [[C0]], label [[T0:%.*]], label [[F0:%.*]] @@ -461,7 +461,7 @@ m: ret void } -define void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" { +define amdgpu_kernel void @multiple_blocks_kernel_2(i1 %c0, i1 %c1, ptr %p) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@multiple_blocks_kernel_2 ; CHECK-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] { ; CHECK-NEXT: store i32 4, ptr [[P]], align 4 @@ -727,7 +727,7 @@ define internal void @barrier_then_write_then_barrier0(ptr %p) { call void @aligned_barrier() ret void } -define void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" { +define amdgpu_kernel void @multiple_blocks_functions_kernel_effects_0(i1 %c0, i1 %c1, ptr %p) "kernel" { ; MODULE-LABEL: define {{[^@]+}}@multiple_blocks_functions_kernel_effects_0 ; MODULE-SAME: (i1 [[C0:%.*]], i1 [[C1:%.*]], ptr [[P:%.*]]) #[[ATTR4]] { ; MODULE-NEXT: call void @barrier_then_write_then_barrier0(ptr [[P]]) @@ -1040,7 +1040,7 @@ define internal void @callee_barrier() { call void @aligned_barrier() ret void } -define void @caller_barrier1() "kernel" { +define amdgpu_kernel void @caller_barrier1() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@caller_barrier1 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @callee_barrier() @@ -1051,7 +1051,7 @@ define void @caller_barrier1() "kernel" { call void @aligned_barrier() ret void } -define void @caller_barrier2() "kernel" { +define amdgpu_kernel void @caller_barrier2() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@caller_barrier2 ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: call void @unknown() @@ -1065,7 +1065,7 @@ define void @caller_barrier2() "kernel" { ret void } -define void @loop_barrier() "kernel" { +define amdgpu_kernel void @loop_barrier() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1095,7 +1095,7 @@ exit: ret void } -define void @loop_barrier_end_barriers() "kernel" { +define amdgpu_kernel void 
@loop_barrier_end_barriers() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1129,7 +1129,7 @@ exit: ret void } -define void @loop_barrier_end_barriers_unknown() "kernel" { +define amdgpu_kernel void @loop_barrier_end_barriers_unknown() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_unknown ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1165,7 +1165,7 @@ exit: ret void } -define void @loop_barrier_store() "kernel" { +define amdgpu_kernel void @loop_barrier_store() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_store ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1195,7 +1195,7 @@ exit: ret void } -define void @loop_barrier_end_barriers_store() "kernel" { +define amdgpu_kernel void @loop_barrier_end_barriers_store() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@loop_barrier_end_barriers_store ; CHECK-SAME: () #[[ATTR4]] { ; CHECK-NEXT: entry: @@ -1232,37 +1232,7 @@ exit: } !llvm.module.flags = !{!16,!15} -!nvvm.annotations = !{!0,!1,!2,!3,!4,!5,!6,!7,!8,!9,!10,!11,!12,!13,!14,!17,!18,!19,!20,!21,!22,!23,!24,!25,!26,!27,!28,!29,!30} -!0 = !{ptr @pos_empty_1, !"kernel", i32 1} -!1 = !{ptr @pos_empty_2, !"kernel", i32 1} -!2 = !{ptr @pos_empty_3, !"kernel", i32 1} -!3 = !{ptr @pos_empty_4, !"kernel", i32 1} -!4 = !{ptr @pos_empty_5, !"kernel", i32 1} -!5 = !{ptr @pos_empty_6, !"kernel", i32 1} -!17 = !{ptr @pos_empty_7a, !"kernel", i32 1} -!18 = !{ptr @pos_empty_7b, !"kernel", i32 1} -!23 = !{ptr @pos_empty_8, !"kernel", i32 1} -!24 = !{ptr @caller_barrier1, !"kernel", i32 1} -!25 = !{ptr @caller_barrier2, !"kernel", i32 1} -!26 = !{ptr @loop_barrier, !"kernel", i32 1} -!27 = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1} -!28 = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1} -!29 = !{ptr @loop_barrier_store, !"kernel", i32 1} -!30 = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1} -!6 = !{ptr @neg_empty_8, !"kernel", i32 1} -!19 = !{ptr @neg_empty_9, !"kernel", i32 1} -!20 = !{ptr @pos_empty_10, !"kernel", i32 1} -!21 = !{ptr @pos_empty_11, !"kernel", i32 1} -!22 = !{ptr @neg_empty_12, !"kernel", i32 1} -!7 = !{ptr @pos_constant_loads, !"kernel", i32 1} -!8 = !{ptr @neg_loads, !"kernel", i32 1} -!9 = !{ptr @pos_priv_mem, !"kernel", i32 1} -!10 = !{ptr @neg_mem, !"kernel", i32 1} -!11 = !{ptr @pos_multiple, !"kernel", i32 1} -!12 = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1} -!13 = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1} -!14 = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} !15 = !{i32 7, !"openmp", i32 50} !16 = !{i32 7, !"openmp-device", i32 50} ;. @@ -1282,65 +1252,7 @@ exit: ;. 
; MODULE: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ; MODULE: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; MODULE: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1} -; MODULE: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1} -; MODULE: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1} -; MODULE: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1} -; MODULE: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1} -; MODULE: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1} -; MODULE: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1} -; MODULE: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1} -; MODULE: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1} -; MODULE: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1} -; MODULE: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1} -; MODULE: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1} -; MODULE: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1} -; MODULE: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1} -; MODULE: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} -; MODULE: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1} -; MODULE: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1} -; MODULE: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1} -; MODULE: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1} -; MODULE: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1} -; MODULE: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1} -; MODULE: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1} -; MODULE: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1} -; MODULE: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1} -; MODULE: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1} -; MODULE: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1} -; MODULE: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1} -; MODULE: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1} -; MODULE: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1} ;. 
; CGSCC: [[META0:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CGSCC: [[META2:![0-9]+]] = !{ptr @pos_empty_1, !"kernel", i32 1} -; CGSCC: [[META3:![0-9]+]] = !{ptr @pos_empty_2, !"kernel", i32 1} -; CGSCC: [[META4:![0-9]+]] = !{ptr @pos_empty_3, !"kernel", i32 1} -; CGSCC: [[META5:![0-9]+]] = !{ptr @pos_empty_4, !"kernel", i32 1} -; CGSCC: [[META6:![0-9]+]] = !{ptr @pos_empty_5, !"kernel", i32 1} -; CGSCC: [[META7:![0-9]+]] = !{ptr @pos_empty_6, !"kernel", i32 1} -; CGSCC: [[META8:![0-9]+]] = !{ptr @neg_empty_8, !"kernel", i32 1} -; CGSCC: [[META9:![0-9]+]] = !{ptr @pos_constant_loads, !"kernel", i32 1} -; CGSCC: [[META10:![0-9]+]] = !{ptr @neg_loads, !"kernel", i32 1} -; CGSCC: [[META11:![0-9]+]] = !{ptr @pos_priv_mem, !"kernel", i32 1} -; CGSCC: [[META12:![0-9]+]] = !{ptr @neg_mem, !"kernel", i32 1} -; CGSCC: [[META13:![0-9]+]] = !{ptr @pos_multiple, !"kernel", i32 1} -; CGSCC: [[META14:![0-9]+]] = !{ptr @multiple_blocks_kernel_1, !"kernel", i32 1} -; CGSCC: [[META15:![0-9]+]] = !{ptr @multiple_blocks_kernel_2, !"kernel", i32 1} -; CGSCC: [[META16:![0-9]+]] = !{ptr @multiple_blocks_functions_kernel_effects_0, !"kernel", i32 1} -; CGSCC: [[META17:![0-9]+]] = !{ptr @pos_empty_7a, !"kernel", i32 1} -; CGSCC: [[META18:![0-9]+]] = !{ptr @pos_empty_7b, !"kernel", i32 1} -; CGSCC: [[META19:![0-9]+]] = !{ptr @neg_empty_9, !"kernel", i32 1} -; CGSCC: [[META20:![0-9]+]] = !{ptr @pos_empty_10, !"kernel", i32 1} -; CGSCC: [[META21:![0-9]+]] = !{ptr @pos_empty_11, !"kernel", i32 1} -; CGSCC: [[META22:![0-9]+]] = !{ptr @neg_empty_12, !"kernel", i32 1} -; CGSCC: [[META23:![0-9]+]] = !{ptr @pos_empty_8, !"kernel", i32 1} -; CGSCC: [[META24:![0-9]+]] = !{ptr @caller_barrier1, !"kernel", i32 1} -; CGSCC: [[META25:![0-9]+]] = !{ptr @caller_barrier2, !"kernel", i32 1} -; CGSCC: [[META26:![0-9]+]] = !{ptr @loop_barrier, !"kernel", i32 1} -; CGSCC: [[META27:![0-9]+]] = !{ptr @loop_barrier_end_barriers, !"kernel", i32 1} -; CGSCC: [[META28:![0-9]+]] = !{ptr @loop_barrier_end_barriers_unknown, !"kernel", i32 1} -; CGSCC: [[META29:![0-9]+]] = !{ptr @loop_barrier_store, !"kernel", i32 1} -; CGSCC: [[META30:![0-9]+]] = !{ptr @loop_barrier_end_barriers_store, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/bug66687.ll b/llvm/test/Transforms/OpenMP/bug66687.ll index e0a9b825a8804..9bb069b1735be 100644 --- a/llvm/test/Transforms/OpenMP/bug66687.ll +++ b/llvm/test/Transforms/OpenMP/bug66687.ll @@ -5,25 +5,22 @@ source_filename = "bug66687.ll" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" target triple = "nvptx64-nvidia-cuda" -define weak void @openmp_kernel() "kernel" { -; CHECK-LABEL: define weak void @openmp_kernel( +define weak ptx_kernel void @openmp_kernel() "kernel" { +; CHECK-LABEL: define weak ptx_kernel void @openmp_kernel( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: ret void ; ret void } -define weak_odr void @non_openmp_kernel() { -; CHECK-LABEL: define weak_odr void @non_openmp_kernel() { +define weak_odr ptx_kernel void @non_openmp_kernel() { +; CHECK-LABEL: define weak_odr ptx_kernel void @non_openmp_kernel() { ; CHECK-NEXT: ret void ; ret void } !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3} !0 = !{i32 7, !"openmp", i32 51} !1 = !{i32 7, !"openmp-device", i32 51} -!2 = !{ptr @openmp_kernel, !"kernel", i32 1} -!3 = !{ptr @non_openmp_kernel, !"kernel", i32 1} diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll index e6ddf16f06763..10e521bbfcc10 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -138,7 +138,7 @@ @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -197,7 +197,7 @@ declare i32 @__kmpc_global_thread_num(ptr) #3 declare void @__kmpc_target_deinit() -define weak void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -290,7 +290,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -367,7 +367,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -453,7 +453,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -537,7 +537,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -624,7 +624,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -679,7 +679,7 @@ return: ; preds = %if.end, %if.then declare i32 @omp_get_thread_num(...) #4 -define weak void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -812,7 +812,6 @@ attributes #8 = { convergent "llvm.assume"="omp_no_openmp" } attributes #9 = { convergent nounwind readonly willreturn } !omp_offload.info = !{!0, !1, !2, !3, !4, !5, !6, !7} -!nvvm.annotations = !{!8, !9, !10, !11, !12, !13, !14, !15} !llvm.module.flags = !{!16, !17, !18} !0 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} @@ -823,14 +822,6 @@ attributes #9 = { convergent nounwind readonly willreturn } !5 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} !6 = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} !7 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -!8 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -!9 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -!10 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -!11 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -!12 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -!13 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -!14 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -!15 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} !16 = !{i32 1, !"wchar_size", i32 4} !17 = !{i32 7, !"openmp", i32 50} !18 = !{i32 7, !"openmp-device", i32 50} @@ -4107,17 +4098,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU: [[META8:![0-9]+]] = !{ptr 
@__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4127,17 +4110,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4147,17 +4122,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4167,15 +4134,7 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX-DISABLED: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX-DISABLED: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX-DISABLED: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX-DISABLED: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX-DISABLED: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX-DISABLED: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX-DISABLED: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll index d20821d450365..9576ff6ca6aee 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines_pre_lto.ll @@ -139,7 +139,7 @@ @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_needed_l14(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -196,7 +196,7 @@ declare i32 @__kmpc_global_thread_num(ptr) #3 declare void @__kmpc_target_deinit() -define weak void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_l22(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -289,7 +289,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -366,7 +366,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -452,7 +452,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -536,7 +536,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_pure_l77(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -623,7 +623,7 @@ entry: ret void } -define weak void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -678,7 +678,7 @@ return: ; preds = %if.end, %if.then declare i32 @omp_get_thread_num(...) #4 -define weak void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112(ptr %dyn) #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -811,7 +811,6 @@ attributes #8 = { convergent "llvm.assume"="omp_no_openmp" } attributes #9 = { convergent nounwind readonly willreturn } !omp_offload.info = !{!0, !1, !2, !3, !4, !5, !6, !7} -!nvvm.annotations = !{!8, !9, !10, !11, !12, !13, !14, !15} !llvm.module.flags = !{!16, !17, !18} !0 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} @@ -822,14 +821,6 @@ attributes #9 = { convergent nounwind readonly willreturn } !5 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} !6 = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} !7 = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -!8 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -!9 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -!10 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -!11 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -!12 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -!13 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -!14 = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -!15 = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} !16 = !{i32 1, !"wchar_size", i32 4} !17 = !{i32 7, !"openmp", i32 50} !18 = !{i32 7, !"openmp-device", i32 50} @@ -4976,17 +4967,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU1: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU1: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU1: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU1: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU1: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU1: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU1: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU1: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU1: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU1: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU1: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU1: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU1: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX1: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX1: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -4996,17 +4979,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX1: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX1: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX1: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX1: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX1: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX1: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX1: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX1: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX1: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX1: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX1: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX1: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX1: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; AMDGPU2: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; AMDGPU2: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5016,17 +4991,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU2: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU2: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU2: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU2: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU2: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU2: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU2: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU2: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU2: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU2: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU2: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU2: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU2: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; AMDGPU3: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; AMDGPU3: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5036,17 +5003,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; AMDGPU3: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; AMDGPU3: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; AMDGPU3: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; AMDGPU3: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; AMDGPU3: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; AMDGPU3: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; AMDGPU3: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; AMDGPU3: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; AMDGPU3: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; AMDGPU3: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; AMDGPU3: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; AMDGPU3: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU3: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU3: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU3: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU3: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU3: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX2: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX2: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5056,17 +5015,9 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX2: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX2: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX2: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX2: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX2: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX2: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX2: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX2: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX2: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX2: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX2: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX2: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX2: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
; NVPTX3: [[META0:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural", i32 39, i32 2} ; NVPTX3: [[META1:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_no_openmp_attr", i32 66, i32 4} @@ -5076,15 +5027,7 @@ attributes #9 = { convergent nounwind readonly willreturn } ; NVPTX3: [[META5:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine_interprocedural_nested_recursive", i32 92, i32 6} ; NVPTX3: [[META6:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"no_state_machine_weak_callee", i32 112, i32 7} ; NVPTX3: [[META7:![0-9]+]] = !{i32 0, i32 20, i32 171331627, !"simple_state_machine", i32 22, i32 1} -; NVPTX3: [[META8:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_needed_l14, !"kernel", i32 1} -; NVPTX3: [[META9:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_l22, !"kernel", i32 1} -; NVPTX3: [[META10:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39, !"kernel", i32 1} -; NVPTX3: [[META11:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55, !"kernel", i32 1} -; NVPTX3: [[META12:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66, !"kernel", i32 1} -; NVPTX3: [[META13:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_pure_l77, !"kernel", i32 1} -; NVPTX3: [[META14:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92, !"kernel", i32 1} -; NVPTX3: [[META15:![0-9]+]] = !{ptr @__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112, !"kernel", i32 1} -; NVPTX3: [[META16:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX3: [[META17:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX3: [[META18:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX3: [[META8:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX3: [[META9:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX3: [[META10:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll index f7bfd30650694..ad41639511e99 100644 --- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll @@ -59,7 +59,7 @@ target triple = "nvptx64" ; Function Attrs: convergent norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { entry: %captured_vars_addrs.i.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_environment, ptr %dyn) #3, !dbg !18 @@ -104,7 +104,7 @@ declare i32 @__kmpc_global_thread_num(ptr) local_unnamed_addr #3 declare void @__kmpc_target_deinit() local_unnamed_addr ; Function Attrs: norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { entry: %captured_vars_addrs.i2.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_environment, ptr %dyn) #3, !dbg !33 @@ -175,7 +175,6 @@ attributes #8 = { "llvm.assume"="omp_no_parallelism" } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!3, !4} -!nvvm.annotations = !{!5, !6} !llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} !llvm.ident = !{!14} @@ -184,8 +183,6 @@ attributes #8 = { "llvm.assume"="omp_no_parallelism" } !2 = !{} !3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1} !4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0} -!5 = !{ptr @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1} -!6 = !{ptr @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1} !7 = !{i32 7, !"Dwarf Version", i32 2} !8 = !{i32 2, !"Debug Info Version", i32 3} !9 = !{i32 1, !"wchar_size", i32 4} diff --git a/llvm/test/Transforms/OpenMP/deduplication_target.ll b/llvm/test/Transforms/OpenMP/deduplication_target.ll index 6b0563365c648..7027c3275b932 100644 --- a/llvm/test/Transforms/OpenMP/deduplication_target.ll +++ b/llvm/test/Transforms/OpenMP/deduplication_target.ll @@ -15,7 +15,7 @@ target triple = "nvptx64" declare void @use(i32) -define weak void @__omp_offloading_50_a3e09bf8_foo_l2(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_50_a3e09bf8_foo_l2(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_50_a3e09bf8_foo_l2 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -56,11 +56,9 @@ attributes #0 = { convergent noinline norecurse nounwind "kernel" "frame-pointer attributes #1 = { nounwind } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4} !0 = !{i32 0, i32 80, i32 -1545561096, !"foo", i32 2, i32 0} -!1 = !{ptr @__omp_offloading_50_a3e09bf8_foo_l2, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll index 6102201ad4bac..6a4519a161fd6 100644 --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll +++ 
b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -19,7 +19,7 @@ target triple = "nvptx64" ; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 ;. -define weak void @kernel0(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @kernel0(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel0 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @kernel0_kernel_environment, ptr [[DYN]]) @@ -43,7 +43,7 @@ define weak void @kernel0(ptr %dyn) "kernel" #0 { ret void } -define weak void @kernel1(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @kernel1(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel1 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @kernel1_kernel_environment, ptr [[DYN]]) @@ -63,7 +63,7 @@ define weak void @kernel1(ptr %dyn) "kernel" #0 { ret void } -define weak void @kernel2(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @kernel2(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -200,15 +200,11 @@ declare i32 @__kmpc_global_thread_num(ptr) !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4} attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @kernel0, !"kernel", i32 1} -!3 = !{ptr @kernel1, !"kernel", i32 1} -!4 = !{ptr @kernel2, !"kernel", i32 1} ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" "omp_target_num_teams"="777" "omp_target_thread_limit"="666" } ; CHECK: attributes #[[ATTR1]] = { nounwind } @@ -217,7 +213,4 @@ attributes #0 = { "omp_target_thread_limit"="666" "omp_target_num_teams"="777"} ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @kernel0, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @kernel1, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1} ;. diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll index 0cf6e7488b4dd..3037d24b8c448 100644 --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold_optnone.ll @@ -7,7 +7,7 @@ target triple = "nvptx64" ;. ; CHECK: @G = external global i32 ;. 
-define weak void @kernel0() #0 { +define weak ptx_kernel void @kernel0() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel0 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr null, ptr null) @@ -25,7 +25,7 @@ define weak void @kernel0() #0 { ret void } -define weak void @kernel1() #0 { +define weak ptx_kernel void @kernel1() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel1 ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr null, ptr null) @@ -39,7 +39,7 @@ define weak void @kernel1() #0 { ret void } -define weak void @kernel2() #0 { +define weak ptx_kernel void @kernel2() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr null, ptr null) @@ -107,15 +107,11 @@ declare void @__kmpc_target_deinit() #1 !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4} attributes #0 = { optnone noinline "kernel" "omp_target_thread_limit"="666" "omp_target_num_teams"="777"} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @kernel0, !"kernel", i32 1} -!3 = !{ptr @kernel1, !"kernel", i32 1} -!4 = !{ptr @kernel2, !"kernel", i32 1} ; ;. ; CHECK: attributes #[[ATTR0]] = { noinline optnone "kernel" "omp_target_num_teams"="777" "omp_target_thread_limit"="666" } @@ -123,7 +119,4 @@ attributes #0 = { optnone noinline "kernel" "omp_target_thread_limit"="666" "omp ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @kernel0, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @kernel1, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/global_constructor.ll b/llvm/test/Transforms/OpenMP/global_constructor.ll index 804b910dcd308..1d18e527e1466 100644 --- a/llvm/test/Transforms/OpenMP/global_constructor.ll +++ b/llvm/test/Transforms/OpenMP/global_constructor.ll @@ -10,7 +10,7 @@ @_ZL6Device = internal global double 0.000000e+00, align 8 @__omp_offloading_fd02_85283c04_main_l11_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_fd02_85283c04_main_l11(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %X) local_unnamed_addr "kernel" { +define weak ptx_kernel void @__omp_offloading_fd02_85283c04_main_l11(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %X) local_unnamed_addr "kernel" { entry: %0 = tail call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_85283c04_main_l11_kernel_environment, ptr %dyn) #0 %exec_user_code = icmp eq i32 %0, -1 @@ -39,7 +39,7 @@ declare i32 @__kmpc_target_init(ptr, ptr) local_unnamed_addr declare void @__kmpc_target_deinit() local_unnamed_addr -define weak void @__omp_offloading__fd02_85283c04_Device_l6_ctor() "kernel" { +define weak ptx_kernel void @__omp_offloading__fd02_85283c04_Device_l6_ctor() "kernel" { entry: %call.i = tail call double @__nv_log(double noundef 2.000000e+00) #1 %call.i2 = tail call double @__nv_log(double noundef 2.000000e+00) #1 @@ -58,15 +58,12 @@ attributes #0 = { nounwind } attributes #1 = { convergent nounwind } !omp_offload.info = !{!0, !1, !2} -!nvvm.annotations = !{!3, !4} !llvm.module.flags = !{!5, !6, !7, !8, !9} !llvm.ident = !{!10} !0 = !{i32 0, i32 64770, i32 -2060960764, !"__omp_offloading__fd02_85283c04_Device_l6_ctor", i32 6, i32 1} !1 = !{i32 0, i32 64770, i32 -2060960764, !"main", i32 11, i32 2} !2 = !{i32 1, !"_ZL6Device", i32 0, i32 0} -!3 = !{ptr @__omp_offloading__fd02_85283c04_Device_l6_ctor, !"kernel", i32 1} -!4 = !{ptr @__omp_offloading_fd02_85283c04_main_l11, !"kernel", i32 1} !5 = !{i32 1, !"wchar_size", i32 4} !6 = !{i32 7, !"openmp", i32 50} !7 = !{i32 7, !"openmp-device", i32 50} @@ -86,12 +83,12 @@ attributes #1 = { convergent nounwind } ; CHECK: common.ret: ; CHECK-NEXT: ret void ; CHECK: user_code.entry: -; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @_ZL6Device, align 8, !tbaa [[TBAA11:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr @_ZL6Device, align 8, !tbaa [[TBAA9:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = tail call i32 @__kmpc_get_hardware_thread_id_in_block() #[[ATTR1]] ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: -; CHECK-NEXT: store double [[TMP1]], ptr [[X]], align 8, !tbaa [[TBAA11]] +; CHECK-NEXT: store double [[TMP1]], ptr [[X]], align 8, !tbaa [[TBAA9]] ; CHECK-NEXT: br label [[REGION_BARRIER]] ; CHECK: region.barrier: ; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(ptr nonnull @[[GLOB1:[0-9]+]], i32 [[TMP2]]) #[[ATTR1]] @@ -105,6 +102,6 @@ attributes #1 = { convergent nounwind } ; CHECK-NEXT: [[CALL_I:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[CALL_I2:%.*]] = tail call double @__nv_log(double noundef 2.000000e+00) #[[ATTR2]] ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[CALL_I]], [[CALL_I2]] -; CHECK-NEXT: store double [[DIV]], ptr @_ZL6Device, align 8, !tbaa [[TBAA11]] +; CHECK-NEXT: store double [[DIV]], ptr @_ZL6Device, 
align 8, !tbaa [[TBAA9]] ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/OpenMP/globalization_remarks.ll b/llvm/test/Transforms/OpenMP/globalization_remarks.ll index 878ac9010a7dc..0f37b3e070acd 100644 --- a/llvm/test/Transforms/OpenMP/globalization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/globalization_remarks.ll @@ -13,7 +13,7 @@ target triple = "nvptx64" @S = external local_unnamed_addr global ptr @foo_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } -define void @foo() "kernel" { +define ptx_kernel void @foo() "kernel" { entry: %c = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr null) %0 = call ptr @__kmpc_alloc_shared(i64 4), !dbg !10 @@ -39,7 +39,6 @@ declare void @__kmpc_target_deinit() !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} -!nvvm.annotations = !{!7, !8} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c") @@ -48,7 +47,6 @@ declare void @__kmpc_target_deinit() !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} -!7 = !{ptr @foo, !"kernel", i32 1} !8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !9, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !9 = !DISubroutineType(types: !2) !10 = !DILocation(line: 5, column: 7, scope: !8) diff --git a/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll index b029efbbe3c68..ce17ffcbb2084 100644 --- a/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll +++ b/llvm/test/Transforms/OpenMP/gpu_kernel_detection_remarks.ll @@ -3,11 +3,11 @@ ; CHECK-DAG: remark: :0:0: OpenMP GPU kernel kernel1 ; CHECK-DAG: remark: :0:0: OpenMP GPU kernel kernel2 -define void @kernel1() "kernel" { +define ptx_kernel void @kernel1() "kernel" { ret void } -define void @kernel2() "kernel" { +define ptx_kernel void @kernel2() "kernel" { ret void } @@ -19,10 +19,5 @@ define void @non_kernel() { declare dso_local void @__kmpc_kernel_prepare_parallel(ptr) !llvm.module.flags = !{!4} -!nvvm.annotations = !{!2, !0, !1, !3, !1, !2} -!0 = !{ptr @kernel1, !"kernel", i32 1} -!1 = !{ptr @non_kernel, !"non_kernel", i32 1} -!2 = !{null, !"align", i32 1} -!3 = !{ptr @kernel2, !"kernel", i32 1} !4 = !{i32 7, !"openmp", i32 50} diff --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll index 936f7d1c46781..760c5a354a37c 100644 --- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll +++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll @@ -44,7 +44,7 @@ @2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, ptr @0 }, align 8 @__omp_offloading_10301_87b2c_foo_l7_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define weak void @__omp_offloading_10301_87b2c_foo_l7() "kernel" { +define weak 
ptx_kernel void @__omp_offloading_10301_87b2c_foo_l7() "kernel" { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -173,10 +173,8 @@ entry: } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3} !0 = !{i32 0, i32 66305, i32 555956, !"foo", i32 7, i32 0} -!1 = !{ptr @__omp_offloading_10301_87b2c_foo_l7, !"kernel", i32 1} !2 = !{i32 7, !"openmp", i32 50} !3 = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll index 310ac0a8296c3..2b3a7fabfb459 100644 --- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll +++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll @@ -18,7 +18,7 @@ target triple = "nvptx64" ; CHECK: @none_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ; CHECK: @will_not_be_spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ;. -define weak void @is_spmd() "kernel" { +define weak ptx_kernel void @is_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@is_spmd ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @is_spmd_kernel_environment, ptr null) @@ -36,7 +36,7 @@ define weak void @is_spmd() "kernel" { ret void } -define weak void @will_be_spmd() "kernel" { +define weak ptx_kernel void @will_be_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@will_be_spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -70,7 +70,7 @@ user_code.entry: ret void } -define weak void @non_spmd() "kernel" { +define weak ptx_kernel void @non_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@non_spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @none_spmd_kernel_environment, ptr null) @@ -88,7 +88,7 @@ define weak void @non_spmd() "kernel" { ret void } -define weak void @will_not_be_spmd() "kernel" { +define weak ptx_kernel void @will_not_be_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@will_not_be_spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @will_not_be_spmd_kernel_environment, ptr null) @@ -207,14 +207,9 @@ declare void @foo() declare void @bar() !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4, !5} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @is_spmd, !"kernel", i32 1} -!3 = !{ptr @will_be_spmd, !"kernel", i32 1} -!4 = !{ptr @non_spmd, !"kernel", i32 1} -!5 = !{ptr @will_not_be_spmd, !"kernel", i32 1} ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" } ; CHECK: attributes #[[ATTR1:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" } @@ -223,8 +218,4 @@ declare void @bar() ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @is_spmd, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @will_be_spmd, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @non_spmd, !"kernel", i32 1} -; CHECK: [[META5:![0-9]+]] = !{ptr @will_not_be_spmd, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/nested_parallelism.ll b/llvm/test/Transforms/OpenMP/nested_parallelism.ll index 5c4386b24a3d5..1679a27fdae8b 100644 --- a/llvm/test/Transforms/OpenMP/nested_parallelism.ll +++ b/llvm/test/Transforms/OpenMP/nested_parallelism.ll @@ -43,7 +43,7 @@ target triple = "nvptx64" ; CHECK: @__omp_offloading_10302_bd7e0_main_l13_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK: @__omp_offloading_10302_bd7e0_main_l16_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ;. -define weak_odr protected void @__omp_offloading_10302_bd7e0_main_l13(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { +define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l13(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { ; CHECK-LABEL: @__omp_offloading_10302_bd7e0_main_l13( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8 @@ -127,7 +127,7 @@ entry: declare void @__kmpc_target_deinit(ptr, i8) local_unnamed_addr -define weak_odr protected void @__omp_offloading_10302_bd7e0_main_l16(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { +define weak_odr protected ptx_kernel void @__omp_offloading_10302_bd7e0_main_l16(ptr %dyn, i64 noundef %i) local_unnamed_addr "kernel" { ; CHECK-LABEL: @__omp_offloading_10302_bd7e0_main_l16( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS_I:%.*]] = alloca [1 x ptr], align 8 @@ -315,13 +315,10 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #11 !omp_offload.info = !{!0, !1} -!nvvm.annotations = !{!2, !3} !llvm.module.flags = !{!4, !5} !0 = !{i32 0, i32 66306, i32 776160, !"main", i32 13, i32 0, i32 0} !1 = !{i32 0, i32 66306, i32 776160, !"main", i32 16, i32 0, i32 1} -!2 = !{ptr @__omp_offloading_10302_bd7e0_main_l13, !"kernel", i32 1} -!3 = !{ptr @__omp_offloading_10302_bd7e0_main_l16, !"kernel", i32 1} !4 = !{i32 7, !"openmp", i32 50} !5 = !{i32 7, !"openmp-device", i32 50} @@ -336,8 +333,6 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #11 ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 66306, i32 776160, !"main", i32 13, i32 0, i32 0} ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 66306, i32 776160, !"main", i32 16, i32 0, i32 1} -; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_10302_bd7e0_main_l13, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_10302_bd7e0_main_l16, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} ;. 
diff --git a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll index fd6e7683af8e3..e5f65b26ed223 100644 --- a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll +++ b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll @@ -16,7 +16,7 @@ target triple = "nvptx64" ; CHECK: @spmd_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ; CHECK: @parallel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ;. -define weak void @none_spmd() "kernel" { +define weak ptx_kernel void @none_spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@none_spmd ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @none_spmd_kernel_environment, ptr null) @@ -32,7 +32,7 @@ define weak void @none_spmd() "kernel" { ret void } -define weak void @spmd() "kernel" { +define weak ptx_kernel void @spmd() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@spmd ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @spmd_kernel_environment, ptr null) @@ -48,7 +48,7 @@ define weak void @spmd() "kernel" { ret void } -define weak void @parallel() "kernel" { +define weak ptx_kernel void @parallel() "kernel" { ; CHECK-LABEL: define {{[^@]+}}@parallel ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(ptr @parallel_kernel_environment, ptr null) @@ -136,20 +136,13 @@ declare i32 @__kmpc_target_init(ptr, ptr) #1 declare void @__kmpc_target_deinit() #1 !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @none_spmd, !"kernel", i32 1} -!3 = !{ptr @spmd, !"kernel", i32 1} -!4 = !{ptr @parallel, !"kernel", i32 1} ;. ; CHECK: attributes #[[ATTR0]] = { "kernel" } ; CHECK: attributes #[[ATTR1]] = { alwaysinline } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META2:![0-9]+]] = !{ptr @none_spmd, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @spmd, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{ptr @parallel, !"kernel", i32 1} ;. 
diff --git a/llvm/test/Transforms/OpenMP/remove_globalization.ll b/llvm/test/Transforms/OpenMP/remove_globalization.ll index 31e3ef2b9079f..29f2030c4d42b 100644 --- a/llvm/test/Transforms/OpenMP/remove_globalization.ll +++ b/llvm/test/Transforms/OpenMP/remove_globalization.ll @@ -40,7 +40,7 @@ define weak i32 @__kmpc_target_init(ptr %0, ptr) { } declare void @__kmpc_target_deinit() -define void @kernel(ptr %dyn) "kernel" { +define ptx_kernel void @kernel(ptr %dyn) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@kernel ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -98,14 +98,14 @@ define internal void @bar() { ; CHECK-SAME: () #[[ATTR1]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 -; CHECK-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG8:![0-9]+]] +; CHECK-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG7:![0-9]+]] ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@bar ; CHECK-DISABLED-SAME: () #[[ATTR1]] { ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: [[DOTH2S:%.*]] = alloca i8, i64 4, align 4 -; CHECK-DISABLED-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG8:![0-9]+]] +; CHECK-DISABLED-NEXT: call void @share(ptr nofree [[DOTH2S]]) #[[ATTR5:[0-9]+]], !dbg [[DBG7:![0-9]+]] ; CHECK-DISABLED-NEXT: ret void ; entry: @@ -146,7 +146,7 @@ define void @unused() { ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@unused() { ; CHECK-DISABLED-NEXT: entry: -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6:[0-9]+]], !dbg [[DBG11:![0-9]+]] +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 4) #[[ATTR6:[0-9]+]], !dbg [[DBG10:![0-9]+]] ; CHECK-DISABLED-NEXT: call void @__kmpc_free_shared(ptr [[TMP0]], i64 4) #[[ATTR6]] ; CHECK-DISABLED-NEXT: ret void ; @@ -234,14 +234,12 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !6, !7} -!nvvm.annotations = !{!5} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 13.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "remove_globalization.c", directory: "/tmp/remove_globalization.c") !2 = !{} !3 = !{i32 2, !"Debug Info Version", i32 3} !4 = !{i32 1, !"wchar_size", i32 4} -!5 = !{ptr @kernel, !"kernel", i32 1} !6 = !{i32 7, !"openmp", i32 50} !7 = !{i32 7, !"openmp-device", i32 50} !8 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) @@ -276,10 +274,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1} -; CHECK: [[DBG8]] = !DILocation(line: 4, column: 2, scope: [[META9:![0-9]+]]) -; CHECK: [[META9]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META10:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[META10]] = !DISubroutineType(types: [[META2]]) +; CHECK: [[DBG7]] = !DILocation(line: 4, 
column: 2, scope: [[META8:![0-9]+]]) +; CHECK: [[META8]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META9:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META9]] = !DISubroutineType(types: [[META2]]) ;. ; CHECK-DISABLED: [[META0:![0-9]+]] = distinct !DICompileUnit(language: DW_LANG_C99, file: [[META1:![0-9]+]], producer: "{{.*}}clang version {{.*}}", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: [[META2:![0-9]+]], splitDebugInlining: false, nameTableKind: None) ; CHECK-DISABLED: [[META1]] = !DIFile(filename: "remove_globalization.c", directory: {{.*}}) @@ -288,11 +285,10 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK-DISABLED: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLED: [[META7:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1} -; CHECK-DISABLED: [[DBG8]] = !DILocation(line: 4, column: 2, scope: [[META9:![0-9]+]]) -; CHECK-DISABLED: [[META9]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META10:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK-DISABLED: [[META10]] = !DISubroutineType(types: [[META2]]) -; CHECK-DISABLED: [[DBG11]] = !DILocation(line: 6, column: 2, scope: [[META9]]) +; CHECK-DISABLED: [[DBG7]] = !DILocation(line: 4, column: 2, scope: [[META8:![0-9]+]]) +; CHECK-DISABLED: [[META8]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META9:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK-DISABLED: [[META9]] = !DISubroutineType(types: [[META2]]) +; CHECK-DISABLED: [[DBG10]] = !DILocation(line: 6, column: 2, scope: [[META8]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CHECK-REMARKS: {{.*}} diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll index 6e4fb9e57388b..92cfd75049226 100644 --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -25,7 +25,7 @@ target triple = "nvptx64" @baz_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 2, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -define dso_local void @foo(ptr %dyn) "kernel" { +define dso_local ptx_kernel void @foo(ptr %dyn) "kernel" { entry: %c = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn) %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -36,7 +36,7 @@ entry: ret void } -define void @bar(ptr %dyn) "kernel" { +define ptx_kernel void @bar(ptr %dyn) "kernel" { %c = call i32 @__kmpc_target_init(ptr @bar_kernel_environment, ptr %dyn) call void @unknown_no_openmp() %cmp = icmp eq i32 %c, -1 @@ -60,7 +60,7 @@ exit: ret void } -define void @baz_spmd(ptr %dyn) "kernel" { +define ptx_kernel void @baz_spmd(ptr %dyn) "kernel" { %c = call i32 @__kmpc_target_init(ptr @baz_kernel_environment, ptr %dyn) call void @unknown_no_openmp() %c0 = icmp eq i32 %c, -1 @@ -109,7 +109,6 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} -!nvvm.annotations = !{!7, !8, !13} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "replace_globalization.c", directory: "/tmp/replace_globalization.c") @@ -118,9 +117,6 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} -!7 = !{ptr @foo, !"kernel", i32 1} -!8 = !{ptr @bar, !"kernel", i32 1} -!13 = !{ptr @baz_spmd, !"kernel", i32 1} !9 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 1, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !10 = !DISubroutineType(types: !2) !11 = !DILocation(line: 5, column: 7, scope: !9) @@ -177,7 +173,7 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK-NEXT: [[C0:%.*]] = icmp eq i32 [[C]], -1 ; CHECK-NEXT: br i1 [[C0]], label [[MASTER3:%.*]], label [[EXIT:%.*]] ; CHECK: master3: -; CHECK-NEXT: [[Z:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 24) #[[ATTR6]], !dbg [[DBG10:![0-9]+]] +; CHECK-NEXT: [[Z:%.*]] = call align 4 ptr @__kmpc_alloc_shared(i64 24) #[[ATTR6]], !dbg [[DBG7:![0-9]+]] ; CHECK-NEXT: call void @use.internalized(ptr nofree [[Z]]) #[[ATTR7]] ; CHECK-NEXT: call void @__kmpc_free_shared(ptr [[Z]], i64 24) #[[ATTR8]] ; CHECK-NEXT: br label [[EXIT]] @@ -231,12 +227,9 @@ declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" ; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} ; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{ptr @foo, !"kernel", i32 1} -; CHECK: [[META8:![0-9]+]] = !{ptr @bar, !"kernel", i32 1} -; CHECK: [[META9:![0-9]+]] = !{ptr @baz_spmd, !"kernel", i32 1} -; CHECK: [[DBG10]] = !DILocation(line: 5, column: 14, scope: 
[[META11:![0-9]+]]) -; CHECK: [[META11]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META12:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) -; CHECK: [[META12]] = !DISubroutineType(types: [[META2]]) +; CHECK: [[DBG7]] = !DILocation(line: 5, column: 14, scope: [[META8:![0-9]+]]) +; CHECK: [[META8]] = distinct !DISubprogram(name: "bar", scope: [[META1]], file: [[META1]], line: 1, type: [[META9:![0-9]+]], scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: [[META0]], retainedNodes: [[META2]]) +; CHECK: [[META9]] = !DISubroutineType(types: [[META2]]) ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-LIMIT: {{.*}} diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll index c186e5f04f092..70b9ce41c1a43 100644 --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -16,7 +16,7 @@ ; CHECK: [openmp-opt] Basic block @kernel if.then is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @kernel if.else is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @kernel if.end is executed by a single thread. -define void @kernel(ptr %dyn) "kernel" { +define ptx_kernel void @kernel(ptr %dyn) "kernel" { %call = call i32 @__kmpc_target_init(ptr @kernel_kernel_environment, ptr %dyn) %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.else @@ -116,7 +116,6 @@ attributes #0 = { cold noinline } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} -!nvvm.annotations = !{!7} !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) !1 = !DIFile(filename: "single_threaded_execution.c", directory: "/tmp/single_threaded_execution.c") @@ -125,7 +124,6 @@ attributes #0 = { cold noinline } !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} -!7 = !{ptr @kernel, !"kernel", i32 1} !8 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 8, type: !10, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !9 = distinct !DISubprogram(name: "cold", scope: !1, file: !1, line: 8, type: !10, scopeLine: 2, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) !10 = !DISubroutineType(types: !2) diff --git a/llvm/test/Transforms/OpenMP/spmdization.ll b/llvm/test/Transforms/OpenMP/spmdization.ll index 6ff4b96b57556..983175382f0f0 100644 --- a/llvm/test/Transforms/OpenMP/spmdization.ll +++ b/llvm/test/Transforms/OpenMP/spmdization.ll @@ -105,36 +105,6 @@ @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 1, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @1, ptr null } -; 
AMDGPU-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; AMDGPU-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1 }, ptr @[[GLOB1]], ptr null } -; AMDGPU-DISABLED: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; AMDGPU-DISABLED: @[[X_SHARED_1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; AMDGPU-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; NVPTX-DISABLED: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_L5_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_STACK_VAR_L20_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: 
@[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_L35_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_SEQUENTIAL_LOOP_TO_SHARED_VAR_GUARDED_L50_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 3 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TARGET_L65_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 0, i8 1 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[__OMP_OFFLOADING_FD02_2044372E_DO_NOT_SPMDIZE_TASK_L74_KERNEL_ENVIRONMENT:[a-zA-Z0-9_$"\\.-]+]] = local_unnamed_addr constant [[STRUCT_KERNELENVIRONMENTTY:%.*]] { [[STRUCT_CONFIGURATIONENVIRONMENTTY:%.*]] { i8 0, i8 1, i8 1 }, ptr @[[GLOB1]], ptr null } -; NVPTX-DISABLED: @[[X_SHARED:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; NVPTX-DISABLED: @[[X_SHARED1:[a-zA-Z0-9_$"\\.-]+]] = internal addrspace(3) global [4 x i8] poison, align 4 -; NVPTX-DISABLED: @[[__OMP_OUTLINED__1_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__3_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__5_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__7_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef -; NVPTX-DISABLED: @[[__OMP_OUTLINED__9_WRAPPER_ID:[a-zA-Z0-9_$"\\.-]+]] = private constant i8 undef ;. ; AMDGPU: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" ; AMDGPU: @[[GLOB1:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, ptr @[[GLOB0]] }, align 8 @@ -226,7 +196,7 @@ ; NVPTX-DISABLED2: @__omp_outlined__7_wrapper.ID = private constant i8 undef ; NVPTX-DISABLED2: @__omp_outlined__9_wrapper.ID = private constant i8 undef ;. 
-define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 ; AMDGPU-SAME: () #[[ATTR0:[0-9]+]] { ; AMDGPU-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() @@ -256,15 +226,6 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_l5() #0 { ; NVPTX-DISABLED2-SAME: () #[[ATTR0:[0-9]+]] { ; NVPTX-DISABLED2-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; AMDGPU-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5 -; NVPTX-DISABLED-SAME: () #[[ATTR0:[0-9]+]] { -; NVPTX-DISABLED-NEXT: call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() -; NVPTX-DISABLED-NEXT: ret void call void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ret void } @@ -282,7 +243,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -299,7 +260,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -350,7 +311,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -401,7 +362,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED2-NEXT: store 
i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -451,7 +412,7 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -501,114 +462,10 @@ define internal void @__omp_offloading_fd02_2044372e_sequential_loop_l5__debug() ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; AMDGPU-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: 
[[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__1_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_l5__debug -; NVPTX-DISABLED-SAME: () #[[ATTR1:[0-9]+]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 
@__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__1_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4:[0-9]+]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18:![0-9]+]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -643,10 +500,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -661,10 +518,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__ ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -679,10 +536,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__ ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -697,10 +554,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -715,10 +572,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__ ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -733,45 +590,10 @@ define internal void @__omp_outlined__(ptr noalias %.global_tid., ptr noalias %. 
; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__ -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7:[0-9]+]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] entry: %captured_vars_addrs = alloca [0 x ptr], align 8 br label %for.cond @@ -829,17 +651,6 @@ define internal void @__omp_outlined__1(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define 
{{[^@]+}}@__omp_outlined__1 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8:[0-9]+]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void @@ -906,25 +717,6 @@ define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -937,7 +729,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -950,7 +742,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -967,7 +759,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, 
!tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -1018,7 +810,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1069,7 +861,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1119,7 +911,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1169,114 +961,10 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l2 ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr 
@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__3_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; 
AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__3_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: 
worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__2(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -1314,10 +1002,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1334,10 +1022,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1355,10 +1043,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; 
AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1376,10 +1064,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1396,10 +1084,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__2 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -1416,50 +1104,10 @@ define internal void @__omp_outlined__2(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: [[MALLOC_CAST:%.*]] = 
addrspacecast ptr addrspace(5) [[X_H2S]] to ptr -; AMDGPU-DISABLED-NEXT: call void @use(ptr nocapture [[MALLOC_CAST]]) #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__2 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[X_H2S:%.*]] = alloca i8, i64 4, align 4 -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; NVPTX-DISABLED-NEXT: call void @use(ptr nocapture [[X_H2S]]) #[[ATTR7]] -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] entry: %captured_vars_addrs = alloca [0 x ptr], align 8 %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -1519,17 +1167,6 @@ define internal void @__omp_outlined__3(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void @@ -1596,25 +1233,6 @@ define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) #3 { ; 
NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__3_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__3(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -1628,7 +1246,7 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -1641,7 +1259,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -1658,7 +1276,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -1709,7 +1327,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; 
AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1760,7 +1378,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -1810,7 +1428,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -1860,114 +1478,10 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], 
[[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: 
[[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__5_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr 
[[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -2002,11 +1516,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2021,11 +1535,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2040,11 +1554,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 
[[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2059,11 +1573,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2078,11 +1592,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__4 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2097,48 +1611,11 @@ define internal void @__omp_outlined__4(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; 
NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__4 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26:![0-9]+]] -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] entry: %captured_vars_addrs = alloca [1 x ptr], align 
8 %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -2167,73 +1644,56 @@ define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED1-NEXT: entry: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED2-NEXT: entry: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-DISABLED1-NEXT: entry: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; 
NVPTX-DISABLED1-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-DISABLED2-NEXT: entry: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: %0 = load i32, ptr %x, align 4, !tbaa !18 %inc = add nsw i32 %0, 1 @@ -2252,7 +1712,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; @@ -2264,7 +1724,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; @@ -2276,7 +1736,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: 
[[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; @@ -2288,7 +1748,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; @@ -2300,7 +1760,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; @@ -2312,32 +1772,9 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__5_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; 
NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -2352,7 +1789,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -2365,7 +1802,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; AMDGPU-NEXT: ret void ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -2382,7 +1819,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; NVPTX-NEXT: ret void ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -2433,7 +1870,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: user_code.entry: ; AMDGPU-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -2484,7 +1921,7 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: user_code.entry: ; AMDGPU-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-DISABLED2-NEXT: br label [[COMMON_RET]] @@ -2534,7 +1971,7 @@ define weak void 
@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: user_code.entry: ; NVPTX-DISABLED1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED1-NEXT: br label [[COMMON_RET]] @@ -2584,114 +2021,10 @@ define weak void @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_g ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: user_code.entry: ; NVPTX-DISABLED2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; 
AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void 
@__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__7_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. 
= alloca i32, align 4 @@ -2723,7 +2056,7 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; AMDGPU-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; AMDGPU: region.guarded: -; AMDGPU-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: br label [[REGION_GUARDED_END:%.*]] ; AMDGPU: region.guarded.end: ; AMDGPU-NEXT: br label [[REGION_BARRIER]] @@ -2740,11 +2073,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { @@ -2756,7 +2089,7 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0 ; NVPTX-NEXT: br i1 [[TMP1]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; NVPTX: region.guarded: -; NVPTX-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: br label [[REGION_GUARDED_END:%.*]] ; NVPTX: region.guarded.end: ; NVPTX-NEXT: br label [[REGION_BARRIER]] @@ -2773,17 +2106,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU-DISABLED1-SAME: (ptr noalias 
[[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; AMDGPU-DISABLED1-NEXT: entry: ; AMDGPU-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED1: for.cond: ; AMDGPU-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2793,17 +2126,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED1-NEXT: ret void ; AMDGPU-DISABLED1: for.body: -; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; AMDGPU-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; AMDGPU-DISABLED2-NEXT: entry: ; AMDGPU-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND:%.*]] ; AMDGPU-DISABLED2: for.cond: ; AMDGPU-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2813,17 +2146,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; AMDGPU-DISABLED2-NEXT: ret void ; AMDGPU-DISABLED2: for.body: -; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-DISABLED2-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; 
AMDGPU-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; NVPTX-DISABLED1-NEXT: entry: ; NVPTX-DISABLED1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED1: for.cond: ; NVPTX-DISABLED1-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2833,17 +2166,17 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED1-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED1-NEXT: ret void ; NVPTX-DISABLED1: for.body: -; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED1-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-DISABLED1-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__6 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: br label [[FOR_COND:%.*]] ; NVPTX-DISABLED2: for.cond: ; NVPTX-DISABLED2-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] @@ -2853,50 +2186,11 @@ define internal void @__omp_outlined__6(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void ; NVPTX-DISABLED2: for.body: -; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-DISABLED2-NEXT: 
[[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; AMDGPU-DISABLED: for.cond: -; AMDGPU-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; AMDGPU-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; AMDGPU-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; AMDGPU-DISABLED: for.cond.cleanup: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: for.body: -; AMDGPU-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared.1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; AMDGPU-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__6 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) #[[ATTR1]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x ptr], align 8 -; NVPTX-DISABLED-NEXT: store i32 42, ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: br label [[FOR_COND:%.*]] -; NVPTX-DISABLED: for.cond: -; NVPTX-DISABLED-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; NVPTX-DISABLED-NEXT: [[CMP:%.*]] = icmp slt i32 [[I_0]], 100 -; NVPTX-DISABLED-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -; NVPTX-DISABLED: for.cond.cleanup: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: for.body: -; NVPTX-DISABLED-NEXT: store ptr addrspacecast (ptr addrspace(3) @x_shared1 to ptr), ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 1) -; NVPTX-DISABLED-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-DISABLED-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP29:![0-9]+]] +; NVPTX-DISABLED2-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] entry: %captured_vars_addrs = alloca [1 x ptr], align 8 %x = call align 4 ptr @__kmpc_alloc_shared(i64 4) @@ -2926,73 +2220,56 @@ define internal void @__omp_outlined__7(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__7 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) 
[[X:%.*]]) { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__7 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-NEXT: ret void ; ; AMDGPU-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7 ; AMDGPU-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED1-NEXT: entry: -; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-DISABLED1-NEXT: ret void ; ; AMDGPU-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7 ; AMDGPU-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-DISABLED2-NEXT: entry: -; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; AMDGPU-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; AMDGPU-DISABLED2-NEXT: ret void ; ; NVPTX-DISABLED1-LABEL: define {{[^@]+}}@__omp_outlined__7 ; NVPTX-DISABLED1-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-DISABLED1-NEXT: entry: -; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED1-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED1-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-DISABLED1-NEXT: ret void ; ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__omp_outlined__7 ; NVPTX-DISABLED2-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) 
[[X:%.*]]) { ; NVPTX-DISABLED2-NEXT: entry: -; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] +; NVPTX-DISABLED2-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-DISABLED2-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; AMDGPU-DISABLED-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-DISABLED-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA18]] -; NVPTX-DISABLED-NEXT: call void @unknowni32p(ptr [[X]]) #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: %0 = load i32, ptr %x, align 4, !tbaa !18 %inc = add nsw i32 %0, 1 @@ -3011,7 +2288,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-NEXT: ret void ; @@ -3023,7 +2300,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-NEXT: ret void ; @@ -3035,7 +2312,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr 
[[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED1-NEXT: ret void ; @@ -3047,7 +2324,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; AMDGPU-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; AMDGPU-DISABLED2-NEXT: ret void ; @@ -3059,7 +2336,7 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED1-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED1-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED1-NEXT: ret void ; @@ -3071,32 +2348,9 @@ define internal void @__omp_outlined__7_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-DISABLED2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] +; NVPTX-DISABLED2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__7(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__7_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA26]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__7(ptr 
[[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -3111,7 +2365,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -3384,96 +2638,6 @@ define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65() #0 ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; 
AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label 
[[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__8(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) #[[ATTR5]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 @@ -3530,24 +2694,13 @@ define internal void @__omp_outlined__8(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__8 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { +define weak ptx_kernel void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: @@ -3862,110 +3015,6 @@ define weak void @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74() #0 { ; NVPTX-DISABLED2-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-DISABLED2-NEXT: call void @__kmpc_target_deinit() ; NVPTX-DISABLED2-NEXT: br label [[COMMON_RET]] -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; AMDGPU-DISABLED-SAME: () #[[ATTR0]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) -; AMDGPU-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment) -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU-DISABLED: is_worker_check: -; AMDGPU-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; AMDGPU-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; AMDGPU-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 
[[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; AMDGPU-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; AMDGPU-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; AMDGPU-DISABLED: worker_state_machine.begin: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast ptr addrspace(5) [[WORKER_WORK_FN_ADDR]] to ptr -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU-DISABLED: worker_state_machine.finished: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: worker_state_machine.is_active.check: -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.check: -; AMDGPU-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; AMDGPU-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.execute: -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU-DISABLED: worker_state_machine.parallel_region.end: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU-DISABLED: worker_state_machine.done.barrier: -; AMDGPU-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU-DISABLED: thread.user_code.check: -; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; AMDGPU-DISABLED: common.ret: -; AMDGPU-DISABLED-NEXT: ret void -; AMDGPU-DISABLED: user_code.entry: -; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit() -; 
AMDGPU-DISABLED-NEXT: br label [[COMMON_RET]] -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74 -; NVPTX-DISABLED-SAME: () #[[ATTR0]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x ptr], align 8 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment) -; NVPTX-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[IS_WORKER_CHECK:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX-DISABLED: is_worker_check: -; NVPTX-DISABLED-NEXT: [[BLOCK_HW_SIZE:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block() -; NVPTX-DISABLED-NEXT: [[WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size() -; NVPTX-DISABLED-NEXT: [[BLOCK_SIZE:%.*]] = sub i32 [[BLOCK_HW_SIZE]], [[WARP_SIZE]] -; NVPTX-DISABLED-NEXT: [[THREAD_IS_MAIN_OR_WORKER:%.*]] = icmp slt i32 [[TMP0]], [[BLOCK_SIZE]] -; NVPTX-DISABLED-NEXT: br i1 [[THREAD_IS_MAIN_OR_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]] -; NVPTX-DISABLED: worker_state_machine.begin: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(ptr [[WORKER_WORK_FN_ADDR]]) -; NVPTX-DISABLED-NEXT: [[WORKER_WORK_FN:%.*]] = load ptr, ptr [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-DISABLED-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], null -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX-DISABLED: worker_state_machine.finished: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: worker_state_machine.is_active.check: -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.check: -; NVPTX-DISABLED-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq ptr [[WORKER_WORK_FN]], @__omp_outlined__9_wrapper.ID -; NVPTX-DISABLED-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.execute: -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-DISABLED-NEXT: call void [[WORKER_WORK_FN]](i16 0, i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX-DISABLED: worker_state_machine.parallel_region.end: -; NVPTX-DISABLED-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX-DISABLED: worker_state_machine.done.barrier: -; NVPTX-DISABLED-NEXT: call void @__kmpc_barrier_simple_generic(ptr @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-DISABLED-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX-DISABLED: thread.user_code.check: -; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 -; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label 
[[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] -; NVPTX-DISABLED: common.ret: -; NVPTX-DISABLED-NEXT: ret void -; NVPTX-DISABLED: user_code.entry: -; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: [[TMP2:%.*]] = call ptr @__kmpc_omp_task_alloc(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i64 40, i64 0, ptr @"_omp_task_entry$") #[[ATTR4]] -; NVPTX-DISABLED-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_omp_task(ptr @[[GLOB1]], i32 [[TMP1]], ptr [[TMP2]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__9, ptr @__omp_outlined__9_wrapper.ID, ptr [[CAPTURED_VARS_ADDRS]], i64 0) -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit() -; NVPTX-DISABLED-NEXT: br label [[COMMON_RET]] entry: %captured_vars_addrs = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74_kernel_environment, ptr null) @@ -4021,17 +3070,6 @@ define internal void @.omp_outlined.(i32 %.global_tid., ptr noalias %.part_id., ; NVPTX-DISABLED2-NEXT: entry: ; NVPTX-DISABLED2-NEXT: call void @spmd_amenable() #[[ATTR7]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined. -; AMDGPU-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@.omp_outlined. -; NVPTX-DISABLED-SAME: (i32 [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTPART_ID_:%.*]], ptr noalias [[DOTPRIVATES_:%.*]], ptr noalias [[DOTCOPY_FN_:%.*]], ptr [[DOTTASK_T_:%.*]], ptr noalias [[__CONTEXT:%.*]]) #[[ATTR3:[0-9]+]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @spmd_amenable() #[[ATTR7]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @spmd_amenable() #10 ret void @@ -4093,13 +3131,6 @@ define weak i32 @__kmpc_target_init(ptr, ptr) { ; NVPTX-DISABLED2-LABEL: define {{[^@]+}}@__kmpc_target_init ; NVPTX-DISABLED2-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) { ; NVPTX-DISABLED2-NEXT: ret i32 0 -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__kmpc_target_init -; AMDGPU-DISABLED-SAME: (ptr [[TMP0:%.*]]) { -; AMDGPU-DISABLED-NEXT: ret i32 0 -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__kmpc_target_init -; NVPTX-DISABLED-SAME: (ptr [[TMP0:%.*]]) { -; NVPTX-DISABLED-NEXT: ret i32 0 ret i32 0 } @@ -4158,16 +3189,6 @@ define internal void @__omp_outlined__9(ptr noalias %.global_tid., ptr noalias % ; NVPTX-DISABLED2-NEXT: call void @unknown() #[[ATTR8]] ; NVPTX-DISABLED2-NEXT: ret void ; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 -; AMDGPU-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9 -; NVPTX-DISABLED-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: call void @unknown() #[[ATTR8]] -; NVPTX-DISABLED-NEXT: ret void entry: call void @unknown() #11 ret void @@ -4234,25 +3255,6 @@ define internal void @__omp_outlined__9_wrapper(i16 zeroext %0, i32 %1) #3 { ; 
NVPTX-DISABLED2-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-DISABLED2-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] ; NVPTX-DISABLED2-NEXT: ret void -; -; AMDGPU-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; AMDGPU-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; AMDGPU-DISABLED-NEXT: entry: -; AMDGPU-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; AMDGPU-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; AMDGPU-DISABLED-NEXT: ret void -; NVPTX-DISABLED-LABEL: define {{[^@]+}}@__omp_outlined__9_wrapper -; NVPTX-DISABLED-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; NVPTX-DISABLED-NEXT: entry: -; NVPTX-DISABLED-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 -; NVPTX-DISABLED-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) -; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]]) #[[ATTR4]] -; NVPTX-DISABLED-NEXT: ret void entry: %.addr1 = alloca i32, align 4 %.zero.addr = alloca i32, align 4 @@ -4280,7 +3282,6 @@ attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" } attributes #11 = { convergent } !omp_offload.info = !{!0, !1, !2, !3, !4, !5} -!nvvm.annotations = !{!6, !7, !8, !9, !10, !11} !llvm.module.flags = !{!12, !13, !14, !15, !16} !llvm.ident = !{!17} @@ -4290,12 +3291,6 @@ attributes #11 = { convergent } !3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} !4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} !5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -!6 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -!7 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -!8 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -!9 = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -!10 = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -!11 = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} !12 = !{i32 1, !"wchar_size", i32 4} !13 = !{i32 7, !"openmp", i32 50} !14 = !{i32 7, !"openmp-device", i32 50} @@ -4317,92 +3312,6 @@ attributes #11 = { convergent } !30 = !{!31, !27, i64 0} !31 = !{!"kmp_task_t_with_privates", !32, i64 0} !32 = !{!"kmp_task_t", !27, i64 0, !27, i64 8, !19, i64 16, !20, i64 24, !20, i64 32} -; AMDGPU-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR1]] = { norecurse } -; AMDGPU-DISABLED: attributes #[[ATTR2]] = { convergent norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR3]] = { alwaysinline norecurse nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR4]] = { alwaysinline convergent nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR5]] = { nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR6:[0-9]+]] = { nosync nounwind } -; AMDGPU-DISABLED: attributes #[[ATTR7:[0-9]+]] 
= { nofree nosync nounwind allocsize(0) } -; AMDGPU-DISABLED: attributes #[[ATTR8]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; AMDGPU-DISABLED: attributes #[[ATTR9]] = { convergent } -; AMDGPU-DISABLED: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -; AMDGPU-DISABLED: attributes #[[ATTR11:[0-9]+]] = { alwaysinline } -; AMDGPU-DISABLED: attributes #[[ATTR12:[0-9]+]] = { convergent nounwind } -; NVPTX-DISABLED: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR1]] = { norecurse } -; NVPTX-DISABLED: attributes #[[ATTR2]] = { convergent norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR3]] = { alwaysinline norecurse nounwind } -; NVPTX-DISABLED: attributes #[[ATTR4]] = { alwaysinline convergent nounwind } -; NVPTX-DISABLED: attributes #[[ATTR5]] = { nounwind } -; NVPTX-DISABLED: attributes #[[ATTR6:[0-9]+]] = { nosync nounwind } -; NVPTX-DISABLED: attributes #[[ATTR7:[0-9]+]] = { nofree nosync nounwind allocsize(0) } -; NVPTX-DISABLED: attributes #[[ATTR8]] = { convergent "llvm.assume"="ompx_spmd_amenable" } -; NVPTX-DISABLED: attributes #[[ATTR9]] = { convergent } -; NVPTX-DISABLED: attributes #[[ATTR10:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) } -; NVPTX-DISABLED: attributes #[[ATTR11:[0-9]+]] = { alwaysinline } -; NVPTX-DISABLED: attributes #[[ATTR12:[0-9]+]] = { convergent nounwind } -; AMDGPU-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; AMDGPU-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; AMDGPU-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; AMDGPU-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; AMDGPU-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; AMDGPU-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU-DISABLED: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU-DISABLED: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; AMDGPU-DISABLED: [[TBAA18]] = !{!19, !19, i64 0} -; AMDGPU-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; AMDGPU-DISABLED: [[META20:![0-9]+]] = 
!{!"omnipotent char", !21, i64 0} -; AMDGPU-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; AMDGPU-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24} -; AMDGPU-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; AMDGPU-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24} -; AMDGPU-DISABLED: [[TBAA26]] = !{!27, !27, i64 0} -; AMDGPU-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; AMDGPU-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} -; AMDGPU-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} -; NVPTX-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} -; NVPTX-DISABLED: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} -; NVPTX-DISABLED: [[META2:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop", i32 5, i32 0} -; NVPTX-DISABLED: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} -; NVPTX-DISABLED: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} -; NVPTX-DISABLED: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX-DISABLED: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX-DISABLED: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX-DISABLED: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX-DISABLED: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX-DISABLED: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX-DISABLED: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX-DISABLED: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX-DISABLED: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX-DISABLED: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX-DISABLED: [[META17:![0-9]+]] = !{!"clang version 14.0.0"} -; NVPTX-DISABLED: [[TBAA18]] = !{!19, !19, i64 0} -; NVPTX-DISABLED: [[META19:![0-9]+]] = !{!"int", !20, i64 0} -; NVPTX-DISABLED: [[META20:![0-9]+]] = !{!"omnipotent char", !21, i64 0} -; NVPTX-DISABLED: [[META21:![0-9]+]] = !{!"Simple C/C++ TBAA"} -; NVPTX-DISABLED: [[LOOP22]] = distinct !{!22, !23, !24} -; NVPTX-DISABLED: [[META23:![0-9]+]] = !{!"llvm.loop.mustprogress"} -; NVPTX-DISABLED: [[META24:![0-9]+]] = !{!"llvm.loop.unroll.disable"} -; NVPTX-DISABLED: [[LOOP25]] = distinct !{!25, !23, !24} -; NVPTX-DISABLED: [[TBAA26]] = !{!27, !27, i64 0} -; NVPTX-DISABLED: [[META27:![0-9]+]] = !{!"any pointer", !20, i64 0} -; NVPTX-DISABLED: [[LOOP28]] = distinct !{!28, !23, !24} -; NVPTX-DISABLED: [[LOOP29]] = distinct !{!29, !23, !24} ;. 
; AMDGPU: attributes #[[ATTR0]] = { alwaysinline convergent norecurse nounwind "kernel" } ; AMDGPU: attributes #[[ATTR1]] = { norecurse } @@ -4488,30 +3397,24 @@ attributes #11 = { convergent } ; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; AMDGPU: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; AMDGPU: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; AMDGPU: [[META21]] = !{!"Simple C/C++ TBAA"} -; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; AMDGPU: [[META23]] = !{!"llvm.loop.mustprogress"} -; AMDGPU: [[META24]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; AMDGPU: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; AMDGPU: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; AMDGPU: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; AMDGPU: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; AMDGPU: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], 
[[META17]], [[META18]]} ;. ; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4519,30 +3422,24 @@ attributes #11 = { convergent } ; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; NVPTX: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; NVPTX: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; NVPTX: [[META21]] = !{!"Simple C/C++ TBAA"} -; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; NVPTX: [[META23]] = !{!"llvm.loop.mustprogress"} -; NVPTX: [[META24]] = !{!"llvm.loop.unroll.disable"} -; NVPTX: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; NVPTX: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; NVPTX: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; NVPTX: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; NVPTX: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; NVPTX: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"} +; NVPTX: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"} +; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"} +; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], 
[[META18]]} +; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; AMDGPU-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; AMDGPU-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4550,30 +3447,24 @@ attributes #11 = { convergent } ; AMDGPU-DISABLED1: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU-DISABLED1: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU-DISABLED1: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU-DISABLED1: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU-DISABLED1: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED1: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; AMDGPU-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; AMDGPU-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; AMDGPU-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"} -; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; AMDGPU-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"} -; AMDGPU-DISABLED1: [[META24]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; AMDGPU-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; AMDGPU-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; AMDGPU-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; AMDGPU-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; AMDGPU-DISABLED1: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED1: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU-DISABLED1: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED1: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU-DISABLED1: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU-DISABLED1: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU-DISABLED1: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU-DISABLED1: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU-DISABLED1: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU-DISABLED1: 
[[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU-DISABLED1: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU-DISABLED1: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU-DISABLED1: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU-DISABLED1: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU-DISABLED1: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU-DISABLED1: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU-DISABLED1: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; AMDGPU-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; AMDGPU-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4581,30 +3472,24 @@ attributes #11 = { convergent } ; AMDGPU-DISABLED2: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU-DISABLED2: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU-DISABLED2: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU-DISABLED2: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; AMDGPU-DISABLED2: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU-DISABLED2: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU-DISABLED2: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; AMDGPU-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; AMDGPU-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; AMDGPU-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"} -; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; AMDGPU-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"} -; AMDGPU-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; AMDGPU-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; AMDGPU-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; AMDGPU-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; AMDGPU-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; AMDGPU-DISABLED2: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU-DISABLED2: [[META7:![0-9]+]] = !{i32 7, 
!"openmp", i32 50} +; AMDGPU-DISABLED2: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU-DISABLED2: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU-DISABLED2: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU-DISABLED2: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU-DISABLED2: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU-DISABLED2: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU-DISABLED2: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU-DISABLED2: [[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU-DISABLED2: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU-DISABLED2: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU-DISABLED2: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU-DISABLED2: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU-DISABLED2: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU-DISABLED2: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU-DISABLED2: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; NVPTX-DISABLED1: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX-DISABLED1: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4612,30 +3497,24 @@ attributes #11 = { convergent } ; NVPTX-DISABLED1: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; NVPTX-DISABLED1: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; NVPTX-DISABLED1: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX-DISABLED1: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX-DISABLED1: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED1: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED1: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX-DISABLED1: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX-DISABLED1: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX-DISABLED1: [[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX-DISABLED1: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; NVPTX-DISABLED1: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; NVPTX-DISABLED1: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; NVPTX-DISABLED1: [[META21]] = !{!"Simple C/C++ TBAA"} -; NVPTX-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; NVPTX-DISABLED1: [[META23]] = !{!"llvm.loop.mustprogress"} -; NVPTX-DISABLED1: [[META24]] = 
!{!"llvm.loop.unroll.disable"} -; NVPTX-DISABLED1: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; NVPTX-DISABLED1: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; NVPTX-DISABLED1: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; NVPTX-DISABLED1: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; NVPTX-DISABLED1: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; NVPTX-DISABLED1: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED1: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED1: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED1: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; NVPTX-DISABLED1: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; NVPTX-DISABLED1: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; NVPTX-DISABLED1: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX-DISABLED1: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; NVPTX-DISABLED1: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; NVPTX-DISABLED1: [[META15]] = !{!"Simple C/C++ TBAA"} +; NVPTX-DISABLED1: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; NVPTX-DISABLED1: [[META17]] = !{!"llvm.loop.mustprogress"} +; NVPTX-DISABLED1: [[META18]] = !{!"llvm.loop.unroll.disable"} +; NVPTX-DISABLED1: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; NVPTX-DISABLED1: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX-DISABLED1: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; NVPTX-DISABLED1: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; NVPTX-DISABLED1: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. ; NVPTX-DISABLED2: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5} ; NVPTX-DISABLED2: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1} @@ -4643,28 +3522,22 @@ attributes #11 = { convergent } ; NVPTX-DISABLED2: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; NVPTX-DISABLED2: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; NVPTX-DISABLED2: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; NVPTX-DISABLED2: [[META6:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_l5, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META7:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_stack_var_l20, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META8:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_l35, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META9:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_sequential_loop_to_shared_var_guarded_l50, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META10:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_target_l65, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META11:![0-9]+]] = !{ptr @__omp_offloading_fd02_2044372e_do_not_spmdize_task_l74, !"kernel", i32 1} -; NVPTX-DISABLED2: [[META12:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; NVPTX-DISABLED2: [[META13:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; NVPTX-DISABLED2: [[META14:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; NVPTX-DISABLED2: [[META15:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; NVPTX-DISABLED2: [[META16:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; NVPTX-DISABLED2: 
[[META17:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; NVPTX-DISABLED2: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} -; NVPTX-DISABLED2: [[META19]] = !{!"int", [[META20:![0-9]+]], i64 0} -; NVPTX-DISABLED2: [[META20]] = !{!"omnipotent char", [[META21:![0-9]+]], i64 0} -; NVPTX-DISABLED2: [[META21]] = !{!"Simple C/C++ TBAA"} -; NVPTX-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META23:![0-9]+]], [[META24:![0-9]+]]} -; NVPTX-DISABLED2: [[META23]] = !{!"llvm.loop.mustprogress"} -; NVPTX-DISABLED2: [[META24]] = !{!"llvm.loop.unroll.disable"} -; NVPTX-DISABLED2: [[LOOP25]] = distinct !{[[LOOP25]], [[META23]], [[META24]]} -; NVPTX-DISABLED2: [[TBAA26]] = !{[[META27:![0-9]+]], [[META27]], i64 0} -; NVPTX-DISABLED2: [[META27]] = !{!"any pointer", [[META20]], i64 0} -; NVPTX-DISABLED2: [[LOOP28]] = distinct !{[[LOOP28]], [[META23]], [[META24]]} -; NVPTX-DISABLED2: [[LOOP29]] = distinct !{[[LOOP29]], [[META23]], [[META24]]} +; NVPTX-DISABLED2: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; NVPTX-DISABLED2: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; NVPTX-DISABLED2: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; NVPTX-DISABLED2: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; NVPTX-DISABLED2: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; NVPTX-DISABLED2: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; NVPTX-DISABLED2: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; NVPTX-DISABLED2: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; NVPTX-DISABLED2: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; NVPTX-DISABLED2: [[META15]] = !{!"Simple C/C++ TBAA"} +; NVPTX-DISABLED2: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; NVPTX-DISABLED2: [[META17]] = !{!"llvm.loop.mustprogress"} +; NVPTX-DISABLED2: [[META18]] = !{!"llvm.loop.unroll.disable"} +; NVPTX-DISABLED2: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; NVPTX-DISABLED2: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; NVPTX-DISABLED2: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; NVPTX-DISABLED2: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; NVPTX-DISABLED2: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll index 2f43a4e4286a2..99715cf5b4032 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_assumes.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_assumes.ll @@ -28,7 +28,7 @@ target triple = "nvptx64" ; CHECK: @__omp_offloading_fd02_404433c2_main_l5_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 3, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK: @[[GLOB2:[0-9]+]] = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 22, ptr @[[GLOB0]] }, align 8 ;. 
-define weak void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 { +define weak ptx_kernel void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull align 8 dereferenceable(8) %x) local_unnamed_addr #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_404433c2_main_l5 ; CHECK-SAME: (ptr [[DYN:%.*]], ptr nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -47,7 +47,7 @@ define weak void @__omp_offloading_fd02_404433c2_main_l5(ptr %dyn, ptr nonnull a ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: -; CHECK-NEXT: store double [[CALL_I]], ptr [[X]], align 8, !tbaa [[TBAA8:![0-9]+]] +; CHECK-NEXT: store double [[CALL_I]], ptr [[X]], align 8, !tbaa [[TBAA7:![0-9]+]] ; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] @@ -127,12 +127,10 @@ attributes #5 = { convergent "frame-pointer"="all" "no-trapping-math"="true" "st attributes #6 = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5, !6} !llvm.ident = !{!7} !0 = !{i32 0, i32 64770, i32 1078211522, !"main", i32 5, i32 0} -!1 = !{ptr @__omp_offloading_fd02_404433c2_main_l5, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} @@ -154,15 +152,14 @@ attributes #6 = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" } ; CHECK: attributes #[[ATTR7]] = { convergent nounwind "llvm.assume"="ompx_spmd_amenable" } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 1078211522, !"main", i32 5, i32 0} -; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_fd02_404433c2_main_l5, !"kernel", i32 1} -; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CHECK: [[TBAA8]] = !{[[META9:![0-9]+]], [[META9]], i64 0} -; CHECK: [[META9]] = !{!"double", [[META10:![0-9]+]], i64 0} -; CHECK: [[META10]] = !{!"omnipotent char", [[META11:![0-9]+]], i64 0} -; CHECK: [[META11]] = !{!"Simple C/C++ TBAA"} +; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[TBAA7]] = !{[[META8:![0-9]+]], [[META8]], i64 0} +; CHECK: [[META8]] = !{!"double", [[META9:![0-9]+]], i64 0} +; CHECK: [[META9]] = !{!"omnipotent char", [[META10:![0-9]+]], i64 0} +; CHECK: [[META10]] = !{!"Simple C/C++ TBAA"} ;. 
diff --git a/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll b/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll index 75e01f3295fe2..953ecb2ddd8a6 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_constant_prop.ll @@ -297,12 +297,10 @@ attributes #14 = { convergent nounwind "llvm.assume"="ompx_aligned_barrier,ompx_ attributes #15 = { convergent nounwind } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5} !llvm.ident = !{!6} !0 = !{i32 0, i32 32, i32 18757968, !"main", i32 12, i32 0} -!1 = !{ptr @__omp_offloading_20_11e3950_main_l12, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll index 229a49d784559..bbf1de253de92 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -58,7 +58,7 @@ target triple = "nvptx64" ; CHECK-DISABLED: @__omp_offloading_2a_fbfa7a_sequential_loop_l6_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK-DISABLED: @__omp_outlined__1_wrapper.ID = private constant i8 undef ;. -define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x, i64 %N) #0 { +define weak ptx_kernel void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x, i64 %N) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2a_fbfa7a_sequential_loop_l6 ; CHECK-SAME: (ptr [[DYN:%.*]], ptr [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -84,9 +84,9 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TMP3]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] ; CHECK: region.guarded: -; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META8:![0-9]+]] -; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META8]] -; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] +; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] +; CHECK-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] ; CHECK: region.guarded.end: ; CHECK-NEXT: br label [[REGION_BARRIER]] @@ -111,7 +111,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 ; CHECK-NEXT: br i1 [[TMP5]], label [[REGION_GUARDED4:%.*]], label [[REGION_BARRIER2:%.*]] ; CHECK: region.guarded4: -; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END1:%.*]] ; CHECK: region.guarded.end1: ; CHECK-NEXT: br label [[REGION_BARRIER2]] @@ -120,10 +120,10 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: br label [[REGION_EXIT3]] ; CHECK: region.exit3: ; CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -; 
CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: __omp_outlined__.exit: ; CHECK-NEXT: call void @__kmpc_parallel_51(ptr null, i32 0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr null, i64 0) -; CHECK-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META8]] +; CHECK-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]] ; CHECK-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64 ; CHECK-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID10:%.*]] @@ -132,7 +132,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 ; CHECK-NEXT: br i1 [[TMP7]], label [[REGION_GUARDED9:%.*]], label [[REGION_BARRIER7:%.*]] ; CHECK: region.guarded9: -; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END6:%.*]] ; CHECK: region.guarded.end6: ; CHECK-NEXT: br label [[REGION_BARRIER7]] @@ -140,7 +140,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP6]]) ; CHECK-NEXT: br label [[REGION_EXIT8:%.*]] ; CHECK: region.exit8: -; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64 ; CHECK-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID15:%.*]] @@ -149,7 +149,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[REGION_GUARDED14:%.*]], label [[REGION_BARRIER12:%.*]] ; CHECK: region.guarded14: -; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END11:%.*]] ; CHECK: region.guarded.end11: ; CHECK-NEXT: br label [[REGION_BARRIER12]] @@ -157,7 +157,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP8]]) ; CHECK-NEXT: br label [[REGION_EXIT13:%.*]] ; CHECK: region.exit13: -; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64 ; CHECK-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] ; CHECK-NEXT: br label [[REGION_CHECK_TID20:%.*]] @@ -166,7 +166,7 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 0 ; CHECK-NEXT: br i1 [[TMP11]], label [[REGION_GUARDED19:%.*]], label [[REGION_BARRIER17:%.*]] ; CHECK: region.guarded19: -; CHECK-NEXT: store i32 [[CALL11_I]], ptr 
[[ARRAYIDX13_I]], align 4, !noalias [[META8]] +; CHECK-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] ; CHECK-NEXT: br label [[REGION_GUARDED_END16:%.*]] ; CHECK: region.guarded.end16: ; CHECK-NEXT: br label [[REGION_BARRIER17]] @@ -174,9 +174,9 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(ptr @[[GLOB2]], i32 [[TMP10]]) ; CHECK-NEXT: br label [[REGION_EXIT18:%.*]] ; CHECK: region.exit18: -; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-NEXT: call void @__kmpc_target_deinit() #[[ATTR6]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: @@ -230,13 +230,13 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-DISABLED-NEXT: [[SELECT:%.*]] = select i1 [[C]], ptr [[AL32]], ptr addrspacecast (ptr addrspace(5) @LocGlob to ptr) ; CHECK-DISABLED-NEXT: store ptr [[SELECT]], ptr [[LOC]], align 8 ; CHECK-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1]]) #[[ATTR6]] -; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META8:![0-9]+]] +; CHECK-DISABLED-NEXT: store i32 0, ptr [[X]], align 4, !noalias [[META7:![0-9]+]] ; CHECK-DISABLED-NEXT: [[ARRAYIDX1_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 1 -; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 1, ptr [[ARRAYIDX1_I]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[SEXT:%.*]] = shl i64 [[N]], 32 ; CHECK-DISABLED-NEXT: [[IDXPROM_I:%.*]] = ashr exact i64 [[SEXT]], 32 ; CHECK-DISABLED-NEXT: [[ARRAYIDX2_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM_I]] -; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[N_ADDR_SROA_0_0_EXTRACT_TRUNC]], ptr [[ARRAYIDX2_I]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: call void @usei8ptr(ptr nocapture [[HEAP2STACK_H2S]]) #[[ATTR9:[0-9]+]] ; CHECK-DISABLED-NEXT: br label [[FOR_COND_I:%.*]] ; CHECK-DISABLED: for.cond.i: @@ -248,26 +248,26 @@ define weak void @__omp_offloading_2a_fbfa7a_sequential_loop_l6(ptr %dyn, ptr %x ; CHECK-DISABLED-NEXT: [[SUB3_I:%.*]] = add nsw i32 [[I_0_I]], -1 ; CHECK-DISABLED-NEXT: [[IDXPROM4_I:%.*]] = zext i32 [[I_0_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX5_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM4_I]] -; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[SUB3_I]], ptr [[ARRAYIDX5_I]], align 4, !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-DISABLED-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-DISABLED: __omp_outlined__.exit: ; CHECK-DISABLED-NEXT: call void 
@__kmpc_parallel_51(ptr null, i32 0, i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper.ID, ptr null, i64 0) -; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: [[CALL_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10:[0-9]+]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM6_I:%.*]] = sext i32 [[CALL_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX7_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM6_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[CALL_I]], ptr [[ARRAYIDX7_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL8_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM9_I:%.*]] = sext i32 [[CALL8_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX10_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM9_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[CALL8_I]], ptr [[ARRAYIDX10_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL11_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: [[IDXPROM12_I:%.*]] = sext i32 [[CALL11_I]] to i64 ; CHECK-DISABLED-NEXT: [[ARRAYIDX13_I:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM12_I]] -; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] -; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META8]] +; CHECK-DISABLED-NEXT: store i32 [[CALL11_I]], ptr [[ARRAYIDX13_I]], align 4, !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] +; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(ptr nonnull [[X]]) #[[ATTR10]], !noalias [[META7]] ; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit() #[[ATTR6]] ; CHECK-DISABLED-NEXT: ret void ; CHECK-DISABLED: worker.exit: @@ -404,12 +404,10 @@ attributes #4 = { inaccessiblememonly nofree nosync nounwind willreturn } attributes #5 = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_amenable" } !omp_offload.info = !{!0} -!nvvm.annotations = !{!1} !llvm.module.flags = !{!2, !3, !4, !5, !6} !llvm.ident = !{!7} !0 = !{i32 0, i32 42, i32 16513658, !"sequential_loop", i32 6, i32 0} -!1 = !{ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6, !"kernel", i32 1} !2 = !{i32 1, !"wchar_size", i32 4} !3 = !{i32 7, !"openmp", i32 50} !4 = !{i32 7, !"openmp-device", i32 50} @@ -447,30 +445,28 @@ attributes #5 = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_ame ; CHECK-DISABLED: attributes #[[ATTR10]] = { convergent nounwind "llvm.assume"="omp_no_openmp,ompx_spmd_amenable" } ;. 
; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 42, i32 16513658, !"sequential_loop", i32 6, i32 0} -; CHECK: [[META1:![0-9]+]] = !{ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6, !"kernel", i32 1} -; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CHECK: [[META8]] = !{[[META9:![0-9]+]]} -; CHECK: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"__omp_outlined__: %__context"} -; CHECK: [[META10]] = distinct !{[[META10]], !"__omp_outlined__"} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} -; CHECK: [[META12]] = !{!"llvm.loop.mustprogress"} +; CHECK: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META7]] = !{[[META8:![0-9]+]]} +; CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]], !"__omp_outlined__: %__context"} +; CHECK: [[META9]] = distinct !{[[META9]], !"__omp_outlined__"} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]]} +; CHECK: [[META11]] = !{!"llvm.loop.mustprogress"} ;. ; CHECK-DISABLED: [[META0:![0-9]+]] = !{i32 0, i32 42, i32 16513658, !"sequential_loop", i32 6, i32 0} -; CHECK-DISABLED: [[META1:![0-9]+]] = !{ptr @__omp_offloading_2a_fbfa7a_sequential_loop_l6, !"kernel", i32 1} -; CHECK-DISABLED: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK-DISABLED: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK-DISABLED: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; CHECK-DISABLED: [[META8]] = !{[[META9:![0-9]+]]} -; CHECK-DISABLED: [[META9]] = distinct !{[[META9]], [[META10:![0-9]+]], !"__omp_outlined__: %__context"} -; CHECK-DISABLED: [[META10]] = distinct !{[[META10]], !"__omp_outlined__"} -; CHECK-DISABLED: [[LOOP11]] = distinct !{[[LOOP11]], [[META12:![0-9]+]]} -; CHECK-DISABLED: [[META12]] = !{!"llvm.loop.mustprogress"} +; CHECK-DISABLED: [[META1:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK-DISABLED: [[META2:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK-DISABLED: [[META3:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK-DISABLED: [[META4:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK-DISABLED: [[META5:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK-DISABLED: [[META6:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK-DISABLED: [[META7]] = !{[[META8:![0-9]+]]} +; CHECK-DISABLED: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]], !"__omp_outlined__: %__context"} +; CHECK-DISABLED: [[META9]] = distinct !{[[META9]], !"__omp_outlined__"} +; CHECK-DISABLED: [[LOOP10]] = distinct !{[[LOOP10]], [[META11:![0-9]+]]} +; CHECK-DISABLED: [[META11]] = !{!"llvm.loop.mustprogress"} ;. 
diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll index 11405b7eb447c..a644fe1b2f821 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding_two_reaching_kernels.ll @@ -56,7 +56,7 @@ target triple = "nvptx64" ; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_spmd_l12_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 1, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ;. -define weak void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) "kernel" #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) "kernel" #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_spmd_l12 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -113,7 +113,7 @@ define weak i32 @__kmpc_target_init(ptr, ptr) { declare void @__kmpc_target_deinit() ; Function Attrs: convergent noinline norecurse nounwind -define weak void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_generic_l20 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -321,14 +321,12 @@ attributes #4 = { alwaysinline } attributes #5 = { convergent } !omp_offload.info = !{!0, !1} -!nvvm.annotations = !{!2, !3} !llvm.module.flags = !{!4, !5, !6, !7, !8} !llvm.ident = !{!9} !0 = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} !1 = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -!2 = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -!3 = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} + !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} @@ -358,23 +356,19 @@ attributes #5 = { convergent } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. 
; CHECK-DISABLE-SPMDIZATION: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK-DISABLE-SPMDIZATION: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll index f348825446c63..6dfc14e9270ed 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_indirect.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_indirect.ll @@ -28,7 +28,7 @@ ; NVPTX: @spmd_and_non_spmd_callees_metadata_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; NVPTX: @spmd_and_non_spmd_callee_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ;. 
-define weak void @spmd_callees(i1 %c) #0 { +define weak ptx_kernel void @spmd_callees(i1 %c) #0 { ; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees ; AMDGPU-SAME: (i1 [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; AMDGPU-NEXT: call void @spmd_callees__debug(i1 [[C]]) @@ -57,7 +57,7 @@ define internal void @spmd_callees__debug(i1 %c) { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17:![0-9]+]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2 ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2 ; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -88,7 +88,7 @@ define internal void @spmd_callees__debug(i1 %c) { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10:[0-9]+]] ; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17:![0-9]+]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12:![0-9]+]] ; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable1, ptr @__omp_outlined_spmd_amenable2 ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable2 ; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -143,10 +143,10 @@ define internal void @__omp_outlined_spmd_amenable1(ptr noalias %.global_tid., p ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable1 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -161,10 +161,10 @@ define internal void @__omp_outlined_spmd_amenable1(ptr noalias %.global_tid., p ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6:[0-9]+]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__1, ptr @__omp_outlined__1_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP21:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP16:![0-9]+]] ; entry: %captured_vars_addrs = alloca [0 x ptr], align 8 @@ -262,10 +262,10 @@ define internal void @__omp_outlined_spmd_amenable2(ptr noalias %.global_tid., p ; AMDGPU-NEXT: call void @spmd_amenable() 
#[[ATTR6]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable2 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -282,10 +282,10 @@ define internal void @__omp_outlined_spmd_amenable2(ptr noalias %.global_tid., p ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__3, ptr @__omp_outlined__3_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP19:![0-9]+]] ; entry: %captured_vars_addrs = alloca [0 x ptr], align 8 @@ -367,7 +367,7 @@ entry: ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @spmd_and_non_spmd_callee(i1 %c) #0 { +define weak ptx_kernel void @spmd_and_non_spmd_callee(i1 %c) #0 { ; ; ; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callee @@ -413,7 +413,7 @@ define weak void @spmd_and_non_spmd_callee(i1 %c) #0 { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable ; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -473,7 +473,7 @@ define weak void @spmd_and_non_spmd_callee(i1 %c) #0 { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[FP:%.*]] = select i1 [[C]], ptr @__omp_outlined_spmd_amenable3, ptr @__omp_outlined_not_spmd_amenable ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_not_spmd_amenable ; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] @@ -530,11 +530,11 @@ define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., p ; AMDGPU-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa 
[[TBAA25:![0-9]+]] -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP27:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable3 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -551,11 +551,11 @@ define internal void @__omp_outlined_spmd_amenable3(ptr noalias %.global_tid., p ; NVPTX-NEXT: call void @__kmpc_free_shared(ptr [[X]], i64 4) #[[ATTR10]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA25:![0-9]+]] -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store ptr [[X]], ptr [[CAPTURED_VARS_ADDRS]], align 8, !tbaa [[TBAA20:![0-9]+]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__5, ptr @__omp_outlined__5_wrapper, ptr [[CAPTURED_VARS_ADDRS]], i64 1) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP27:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP22:![0-9]+]] ; entry: %captured_vars_addrs = alloca [1 x ptr], align 8 @@ -587,18 +587,18 @@ define internal void @__omp_outlined__5(ptr noalias %.global_tid., ptr noalias % ; AMDGPU-LABEL: define {{[^@]+}}@__omp_outlined__5 ; AMDGPU-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @unknown() #[[ATTR7]] ; AMDGPU-NEXT: ret void ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined__5 ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]], ptr nonnull align 4 dereferenceable(4) [[X:%.*]]) { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[INC:%.*]] = add nsw i32 [[TMP0]], 1 -; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[INC]], ptr [[X]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @unknown() #[[ATTR7]] ; NVPTX-NEXT: ret void ; @@ -622,7 +622,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; AMDGPU-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; AMDGPU-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; AMDGPU-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, 
ptr [[TMP2]], align 8, !tbaa [[TBAA25]] +; AMDGPU-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; AMDGPU-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]] ; AMDGPU-NEXT: ret void ; @@ -634,7 +634,7 @@ define internal void @__omp_outlined__5_wrapper(i16 zeroext %0, i32 %1) #3 { ; NVPTX-NEXT: [[GLOBAL_ARGS:%.*]] = alloca ptr, align 8 ; NVPTX-NEXT: call void @__kmpc_get_shared_variables(ptr [[GLOBAL_ARGS]]) ; NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[GLOBAL_ARGS]], align 8 -; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA25]] +; NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8, !tbaa [[TBAA20]] ; NVPTX-NEXT: call void @__omp_outlined__5(ptr [[DOTADDR1]], ptr [[DOTZERO_ADDR]], ptr [[TMP3]]) #[[ATTR10]] ; NVPTX-NEXT: ret void ; @@ -652,7 +652,7 @@ entry: } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @spmd_callees_metadata(ptr %fp) #0 { +define weak ptx_kernel void @spmd_callees_metadata(ptr %fp) #0 { ; ; ; AMDGPU-LABEL: define {{[^@]+}}@spmd_callees_metadata @@ -668,7 +668,7 @@ define weak void @spmd_callees_metadata(ptr %fp) #0 { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) ; AMDGPU-NEXT: call void @__kmpc_target_deinit() ; AMDGPU-NEXT: br label [[COMMON_RET]] @@ -686,7 +686,7 @@ define weak void @spmd_callees_metadata(ptr %fp) #0 { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__omp_outlined_spmd_amenable_external(ptr [[DOTTHREADID_TEMP_]], ptr [[DOTZERO_ADDR]]) ; NVPTX-NEXT: call void @__kmpc_target_deinit() ; NVPTX-NEXT: br label [[COMMON_RET]] @@ -711,7 +711,7 @@ user_code.entry: ; preds = %entry } ; Function Attrs: alwaysinline convergent norecurse nounwind -define weak void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { +define weak ptx_kernel void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; ; ; AMDGPU-LABEL: define {{[^@]+}}@spmd_and_non_spmd_callees_metadata @@ -757,7 +757,7 @@ define weak void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; AMDGPU-NEXT: store i32 0, ptr [[DOTZERO_ADDR]], align 4 -; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external ; AMDGPU-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] ; AMDGPU: 3: @@ -816,7 +816,7 @@ define weak void @spmd_and_non_spmd_callees_metadata(ptr %fp) #0 { ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB1]]) #[[ATTR10]] ; NVPTX-NEXT: store i32 0, 
ptr [[DOTZERO_ADDR]], align 4 -; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: store i32 [[TMP1]], ptr [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: [[TMP2:%.*]] = icmp eq ptr [[FP]], @__omp_outlined_spmd_amenable_external ; NVPTX-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]] ; NVPTX: 3: @@ -868,10 +868,10 @@ define void @__omp_outlined_spmd_amenable_external(ptr noalias %.global_tid., pt ; AMDGPU-NEXT: call void @spmd_amenable() #[[ATTR6]] ; AMDGPU-NEXT: ret void ; AMDGPU: for.body: -; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; AMDGPU-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0) ; AMDGPU-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; AMDGPU-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; ; NVPTX-LABEL: define {{[^@]+}}@__omp_outlined_spmd_amenable_external ; NVPTX-SAME: (ptr noalias [[DOTGLOBAL_TID_:%.*]], ptr noalias [[DOTBOUND_TID_:%.*]]) { @@ -885,10 +885,10 @@ define void @__omp_outlined_spmd_amenable_external(ptr noalias %.global_tid., pt ; NVPTX-NEXT: call void @spmd_amenable() #[[ATTR6]] ; NVPTX-NEXT: ret void ; NVPTX: for.body: -; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA17]] +; NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !tbaa [[TBAA12]] ; NVPTX-NEXT: call void @__kmpc_parallel_51(ptr @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, ptr @__omp_outlined__7, ptr @__omp_outlined__7_wrapper, ptr undef, i64 0) ; NVPTX-NEXT: [[INC]] = add nsw i32 [[I_0]], 1 -; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP28:![0-9]+]] +; NVPTX-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP23:![0-9]+]] ; entry: br label %for.cond @@ -1069,7 +1069,6 @@ attributes #10 = { convergent "llvm.assume"="ompx_spmd_amenable" } attributes #11 = { convergent } !omp_offload.info = !{!0, !1, !2, !3, !4, !5} -!nvvm.annotations = !{!6, !7, !8, !9, !10, !11} !llvm.module.flags = !{!12, !13, !14, !15, !16} !llvm.ident = !{!17} @@ -1079,12 +1078,6 @@ attributes #11 = { convergent } !3 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} !4 = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} !5 = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -!6 = !{ptr @spmd_callees, !"kernel", i32 1} -!7 = !{ptr @spmd_and_non_spmd_callees_metadata, !"kernel", i32 1} -!8 = !{ptr @spmd_and_non_spmd_callee, !"kernel", i32 1} -!9 = !{ptr @spmd_callees_metadata, !"kernel", i32 1} -!10 = !{i32 1} -!11 = !{i32 1} !12 = !{i32 1, !"wchar_size", i32 4} !13 = !{i32 7, !"openmp", i32 50} !14 = !{i32 7, !"openmp-device", i32 50} @@ -1139,29 +1132,24 @@ attributes #11 = { convergent } ; AMDGPU: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2} ; AMDGPU: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4} ; AMDGPU: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3} -; AMDGPU: [[META6:![0-9]+]] = !{ptr @spmd_callees, !"kernel", i32 1} -; AMDGPU: [[META7:![0-9]+]] = !{ptr @spmd_and_non_spmd_callees_metadata, 
!"kernel", i32 1} -; AMDGPU: [[META8:![0-9]+]] = !{ptr @spmd_and_non_spmd_callee, !"kernel", i32 1} -; AMDGPU: [[META9:![0-9]+]] = !{ptr @spmd_callees_metadata, !"kernel", i32 1} -; AMDGPU: [[META10:![0-9]+]] = !{i32 1} -; AMDGPU: [[META11:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; AMDGPU: [[META12:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; AMDGPU: [[META13:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; AMDGPU: [[META14:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; AMDGPU: [[META15:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; AMDGPU: [[META16:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -; AMDGPU: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -; AMDGPU: [[META18]] = !{!"int", [[META19:![0-9]+]], i64 0} -; AMDGPU: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0} -; AMDGPU: [[META20]] = !{!"Simple C/C++ TBAA"} -; AMDGPU: [[LOOP21]] = distinct !{[[LOOP21]], [[META22:![0-9]+]], [[META23:![0-9]+]]} -; AMDGPU: [[META22]] = !{!"llvm.loop.mustprogress"} -; AMDGPU: [[META23]] = !{!"llvm.loop.unroll.disable"} -; AMDGPU: [[LOOP24]] = distinct !{[[LOOP24]], [[META22]], [[META23]]} -; AMDGPU: [[TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0} -; AMDGPU: [[META26]] = !{!"any pointer", [[META19]], i64 0} -; AMDGPU: [[LOOP27]] = distinct !{[[LOOP27]], [[META22]], [[META23]]} -; AMDGPU: [[LOOP28]] = distinct !{[[LOOP28]], [[META22]], [[META23]]} +; AMDGPU: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; AMDGPU: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; AMDGPU: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; AMDGPU: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; AMDGPU: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; AMDGPU: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; AMDGPU: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0} +; AMDGPU: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0} +; AMDGPU: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0} +; AMDGPU: [[META15]] = !{!"Simple C/C++ TBAA"} +; AMDGPU: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]} +; AMDGPU: [[META17]] = !{!"llvm.loop.mustprogress"} +; AMDGPU: [[META18]] = !{!"llvm.loop.unroll.disable"} +; AMDGPU: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]} +; AMDGPU: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +; AMDGPU: [[META21]] = !{!"any pointer", [[META14]], i64 0} +; AMDGPU: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]} +; AMDGPU: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]} ;. 
; NVPTX: [[META0:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_task", i32 74, i32 5}
; NVPTX: [[META1:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_stack_var", i32 20, i32 1}
@@ -1169,27 +1157,22 @@ attributes #11 = { convergent }
; NVPTX: [[META3:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var", i32 35, i32 2}
; NVPTX: [[META4:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"do_not_spmdize_target", i32 65, i32 4}
; NVPTX: [[META5:![0-9]+]] = !{i32 0, i32 64770, i32 541341486, !"sequential_loop_to_shared_var_guarded", i32 50, i32 3}
-; NVPTX: [[META6:![0-9]+]] = !{ptr @spmd_callees, !"kernel", i32 1}
-; NVPTX: [[META7:![0-9]+]] = !{ptr @spmd_and_non_spmd_callees_metadata, !"kernel", i32 1}
-; NVPTX: [[META8:![0-9]+]] = !{ptr @spmd_and_non_spmd_callee, !"kernel", i32 1}
-; NVPTX: [[META9:![0-9]+]] = !{ptr @spmd_callees_metadata, !"kernel", i32 1}
-; NVPTX: [[META10:![0-9]+]] = !{i32 1}
-; NVPTX: [[META11:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
-; NVPTX: [[META12:![0-9]+]] = !{i32 7, !"openmp", i32 50}
-; NVPTX: [[META13:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; NVPTX: [[META14:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
-; NVPTX: [[META15:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
-; NVPTX: [[META16:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
-; NVPTX: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0}
-; NVPTX: [[META18]] = !{!"int", [[META19:![0-9]+]], i64 0}
-; NVPTX: [[META19]] = !{!"omnipotent char", [[META20:![0-9]+]], i64 0}
-; NVPTX: [[META20]] = !{!"Simple C/C++ TBAA"}
-; NVPTX: [[LOOP21]] = distinct !{[[LOOP21]], [[META22:![0-9]+]], [[META23:![0-9]+]]}
-; NVPTX: [[META22]] = !{!"llvm.loop.mustprogress"}
-; NVPTX: [[META23]] = !{!"llvm.loop.unroll.disable"}
-; NVPTX: [[LOOP24]] = distinct !{[[LOOP24]], [[META22]], [[META23]]}
-; NVPTX: [[TBAA25]] = !{[[META26:![0-9]+]], [[META26]], i64 0}
-; NVPTX: [[META26]] = !{!"any pointer", [[META19]], i64 0}
-; NVPTX: [[LOOP27]] = distinct !{[[LOOP27]], [[META22]], [[META23]]}
-; NVPTX: [[LOOP28]] = distinct !{[[LOOP28]], [[META22]], [[META23]]}
+; NVPTX: [[META6:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
+; NVPTX: [[META7:![0-9]+]] = !{i32 7, !"openmp", i32 50}
+; NVPTX: [[META8:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
+; NVPTX: [[META9:![0-9]+]] = !{i32 8, !"PIC Level", i32 2}
+; NVPTX: [[META10:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2}
+; NVPTX: [[META11:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"}
+; NVPTX: [[TBAA12]] = !{[[META13:![0-9]+]], [[META13]], i64 0}
+; NVPTX: [[META13]] = !{!"int", [[META14:![0-9]+]], i64 0}
+; NVPTX: [[META14]] = !{!"omnipotent char", [[META15:![0-9]+]], i64 0}
+; NVPTX: [[META15]] = !{!"Simple C/C++ TBAA"}
+; NVPTX: [[LOOP16]] = distinct !{[[LOOP16]], [[META17:![0-9]+]], [[META18:![0-9]+]]}
+; NVPTX: [[META17]] = !{!"llvm.loop.mustprogress"}
+; NVPTX: [[META18]] = !{!"llvm.loop.unroll.disable"}
+; NVPTX: [[LOOP19]] = distinct !{[[LOOP19]], [[META17]], [[META18]]}
+; NVPTX: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0}
+; NVPTX: [[META21]] = !{!"any pointer", [[META14]], i64 0}
+; NVPTX: [[LOOP22]] = distinct !{[[LOOP22]], [[META17]], [[META18]]}
+; NVPTX: [[LOOP23]] = distinct !{[[LOOP23]], [[META17]], [[META18]]}
+;.
diff --git a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll index f28f61e053275..1cfce147ac81e 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_no_guarding_two_reaching_kernels.ll @@ -57,7 +57,7 @@ target triple = "nvptx64" ; CHECK-DISABLE-SPMDIZATION: @__omp_offloading_2b_10393b5_generic_l20_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr @[[GLOB1]], ptr null } ; CHECK-DISABLE-SPMDIZATION: @__omp_outlined___wrapper.ID = private constant i8 undef ;. -define weak void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_spmd_l12(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_spmd_l12 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: @@ -147,7 +147,7 @@ define weak i32 @__kmpc_target_init(ptr, ptr) { declare void @__kmpc_target_deinit() ; Function Attrs: convergent noinline norecurse nounwind -define weak void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { +define weak ptx_kernel void @__omp_offloading_2b_10393b5_generic_l20(ptr %dyn) #0 { ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_2b_10393b5_generic_l20 ; CHECK-SAME: (ptr [[DYN:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: @@ -397,14 +397,11 @@ attributes #4 = { alwaysinline } attributes #5 = { convergent } !omp_offload.info = !{!0, !1} -!nvvm.annotations = !{!2, !3} !llvm.module.flags = !{!4, !5, !6, !7, !8} !llvm.ident = !{!9} !0 = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} !1 = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -!2 = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -!3 = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} !4 = !{i32 1, !"wchar_size", i32 4} !5 = !{i32 7, !"openmp", i32 50} !6 = !{i32 7, !"openmp-device", i32 50} @@ -434,23 +431,19 @@ attributes #5 = { convergent } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. 
; CHECK-DISABLE-SPMDIZATION: [[META0:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"spmd", i32 12, i32 0} ; CHECK-DISABLE-SPMDIZATION: [[META1:![0-9]+]] = !{i32 0, i32 43, i32 17011637, !"generic", i32 20, i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_spmd_l12, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{ptr @__omp_offloading_2b_10393b5_generic_l20, !"kernel", i32 1} -; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} -; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 7, !"openmp", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META8:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} -; CHECK-DISABLE-SPMDIZATION: [[META9:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +; CHECK-DISABLE-SPMDIZATION: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +; CHECK-DISABLE-SPMDIZATION: [[META3:![0-9]+]] = !{i32 7, !"openmp", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META4:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} +; CHECK-DISABLE-SPMDIZATION: [[META5:![0-9]+]] = !{i32 8, !"PIC Level", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META6:![0-9]+]] = !{i32 7, !"frame-pointer", i32 2} +; CHECK-DISABLE-SPMDIZATION: [[META7:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} ;. diff --git a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll index f5a4cea9a841c..ef36937bc5734 100644 --- a/llvm/test/Transforms/OpenMP/spmdization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_remarks.ll @@ -62,7 +62,7 @@ target triple = "nvptx64" ; Function Attrs: convergent norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_fallback_l11(ptr %dyn) local_unnamed_addr #0 !dbg !15 { entry: %captured_vars_addrs.i.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr nonnull @__omp_offloading_2a_d80d3d_test_fallback_l11_kernel_environment, ptr %dyn) #3, !dbg !18 @@ -107,7 +107,7 @@ declare i32 @__kmpc_global_thread_num(ptr) local_unnamed_addr #3 declare void @__kmpc_target_deinit() local_unnamed_addr ; Function Attrs: norecurse nounwind -define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { +define weak ptx_kernel void @__omp_offloading_2a_d80d3d_test_no_fallback_l20(ptr %dyn) local_unnamed_addr #4 !dbg !32 { entry: %captured_vars_addrs.i2.i = alloca [0 x ptr], align 8 %0 = call i32 @__kmpc_target_init(ptr nonnull @__omp_offloading_2a_d80d3d_test_no_fallback_l20_kernel_environment, ptr %dyn) #3, !dbg !33 @@ -175,7 +175,6 @@ attributes #7 = { "llvm.assume"="ompx_spmd_amenable" } !llvm.dbg.cu = !{!0} !omp_offload.info = !{!3, !4} -!nvvm.annotations = !{!5, !6} !llvm.module.flags = !{!7, !8, !9, !10, !11, !12, !13} !llvm.ident = !{!14} @@ -184,8 +183,6 @@ attributes #7 = { "llvm.assume"="ompx_spmd_amenable" } !2 = !{} !3 = !{i32 0, i32 42, i32 14159165, !"test_no_fallback", i32 20, i32 1} !4 = !{i32 0, i32 42, i32 14159165, !"test_fallback", i32 11, i32 0} -!5 = !{ptr @__omp_offloading_2a_d80d3d_test_fallback_l11, !"kernel", i32 1} -!6 = !{ptr @__omp_offloading_2a_d80d3d_test_no_fallback_l20, !"kernel", i32 1} !7 = !{i32 7, !"Dwarf Version", i32 2} !8 = !{i32 2, !"Debug Info Version", i32 3} !9 = !{i32 1, !"wchar_size", i32 4} 
diff --git a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll index 5e2abbae1811c..2842dfd030b11 100644 --- a/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll +++ b/llvm/test/Transforms/OpenMP/value-simplify-openmp-opt.ll @@ -53,7 +53,7 @@ target triple = "amdgcn-amd-amdhsa" ; CHECK: @str = private unnamed_addr addrspace(4) constant [1 x i8] zeroinitializer, align 1 ; CHECK: @kernel_kernel_environment = local_unnamed_addr constant %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy { i8 0, i8 0, i8 1, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 }, ptr null, ptr null } ;. -define void @kernel(ptr %dyn) "kernel" { +define amdgpu_kernel void @kernel(ptr %dyn) "kernel" { ; ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel @@ -144,7 +144,7 @@ define void @test_assume() { } ; We can't ignore the sync, hence this might store 2 into %p -define void @kernel2(ptr %p) "kernel" { +define amdgpu_kernel void @kernel2(ptr %p) "kernel" { ; CHECK-LABEL: define {{[^@]+}}@kernel2 ; CHECK-SAME: (ptr [[P:%.*]]) #[[ATTR1:[0-9]+]] { ; CHECK-NEXT: store i32 1, ptr addrspace(3) @X, align 4 @@ -163,7 +163,7 @@ define void @kernel2(ptr %p) "kernel" { } ; We can't ignore the sync, hence this might store 2 into %p -define void @kernel3(ptr %p) "kernel" { +define amdgpu_kernel void @kernel3(ptr %p) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel3 ; TUNIT-SAME: (ptr [[P:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 1, ptr addrspace(3) @X, align 4 @@ -199,7 +199,7 @@ define void @sync_def() { ret void } -define void @kernel4a1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4a1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4a1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QA1, align 4 @@ -242,7 +242,7 @@ S: } ; We should not replace the load or delete the second store. -define void @kernel4b1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4b1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4b1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QB1, align 4 @@ -281,7 +281,7 @@ S: ret void } -define void @kernel4a2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4a2(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4a2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -317,7 +317,7 @@ S: } ; FIXME: We should not replace the load with undef. -define void @kernel4b2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4b2(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4b2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -349,7 +349,7 @@ S: ret void } -define void @kernel4a3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4a3(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4a3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QA3, align 4 @@ -401,7 +401,7 @@ S: } ; The load of QB3 should not be simplified to 0. 
-define void @kernel4b3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4b3(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel4b3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: store i32 0, ptr addrspace(3) @QB3, align 4 @@ -453,7 +453,7 @@ S: } -define void @kernel4c1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4c1(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4c1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -488,7 +488,7 @@ S: } ; We should not replace the load or delete the second store. -define void @kernel4d1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4d1(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4d1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -529,7 +529,7 @@ S: ret void } -define void @kernel4c2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4c2(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4c2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -563,7 +563,7 @@ S: } ; We should not replace the load with undef. -define void @kernel4d2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4d2(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4d2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -595,7 +595,7 @@ S: ret void } -define void @kernel4c3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4c3(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4c3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -629,7 +629,7 @@ S: } ; We should not replace the load with undef. -define void @kernel4d3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel4d3(i1 %c) "kernel" { ; TUNIT: Function Attrs: norecurse ; TUNIT-LABEL: define {{[^@]+}}@kernel4d3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR0]] { @@ -661,7 +661,7 @@ S: ret void } -define void @kernel_unknown_and_aligned1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_aligned1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -700,7 +700,7 @@ S: ret void } -define void @kernel_unknown_and_aligned2(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_aligned2(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned2 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -741,7 +741,7 @@ S: ret void } -define void @kernel_unknown_and_aligned3(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_aligned3(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_aligned3 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -782,7 +782,7 @@ S: ret void } -define void @kernel_unknown_and_not_aligned1(i1 %c) "kernel" { +define amdgpu_kernel void @kernel_unknown_and_not_aligned1(i1 %c) "kernel" { ; TUNIT-LABEL: define {{[^@]+}}@kernel_unknown_and_not_aligned1 ; TUNIT-SAME: (i1 [[C:%.*]]) #[[ATTR1]] { ; TUNIT-NEXT: br i1 [[C]], label [[S:%.*]], label [[L:%.*]] @@ -828,29 +828,9 @@ declare void @__kmpc_target_deinit() nocallback declare void @llvm.assume(i1) !llvm.module.flags = !{!0, !1} -!nvvm.annotations = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20} !0 = !{i32 7, !"openmp", i32 50} !1 = !{i32 7, !"openmp-device", i32 50} -!2 = !{ptr @kernel, !"kernel", i32 1} -!3 = 
!{ptr @kernel2, !"kernel", i32 1} -!4 = !{ptr @kernel3, !"kernel", i32 1} -!5 = !{ptr @kernel4a1, !"kernel", i32 1} -!6 = !{ptr @kernel4b1, !"kernel", i32 1} -!7 = !{ptr @kernel4a2, !"kernel", i32 1} -!8 = !{ptr @kernel4b2, !"kernel", i32 1} -!9 = !{ptr @kernel4a3, !"kernel", i32 1} -!10 = !{ptr @kernel4b3, !"kernel", i32 1} -!11 = !{ptr @kernel4c1, !"kernel", i32 1} -!12 = !{ptr @kernel4d1, !"kernel", i32 1} -!13 = !{ptr @kernel4c2, !"kernel", i32 1} -!14 = !{ptr @kernel4d2, !"kernel", i32 1} -!15 = !{ptr @kernel4c3, !"kernel", i32 1} -!16 = !{ptr @kernel4d3, !"kernel", i32 1} -!17 = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1} -!18 = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1} -!19 = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1} -!20 = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1} ;. ; TUNIT: attributes #[[ATTR0]] = { norecurse "kernel" } @@ -872,45 +852,7 @@ declare void @llvm.assume(i1) ;. ; TUNIT: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; TUNIT: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} -; TUNIT: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1} -; TUNIT: [[META3:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1} -; TUNIT: [[META4:![0-9]+]] = !{ptr @kernel3, !"kernel", i32 1} -; TUNIT: [[META5:![0-9]+]] = !{ptr @kernel4a1, !"kernel", i32 1} -; TUNIT: [[META6:![0-9]+]] = !{ptr @kernel4b1, !"kernel", i32 1} -; TUNIT: [[META7:![0-9]+]] = !{ptr @kernel4a2, !"kernel", i32 1} -; TUNIT: [[META8:![0-9]+]] = !{ptr @kernel4b2, !"kernel", i32 1} -; TUNIT: [[META9:![0-9]+]] = !{ptr @kernel4a3, !"kernel", i32 1} -; TUNIT: [[META10:![0-9]+]] = !{ptr @kernel4b3, !"kernel", i32 1} -; TUNIT: [[META11:![0-9]+]] = !{ptr @kernel4c1, !"kernel", i32 1} -; TUNIT: [[META12:![0-9]+]] = !{ptr @kernel4d1, !"kernel", i32 1} -; TUNIT: [[META13:![0-9]+]] = !{ptr @kernel4c2, !"kernel", i32 1} -; TUNIT: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1} -; TUNIT: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1} -; TUNIT: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1} -; TUNIT: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1} -; TUNIT: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1} -; TUNIT: [[META19:![0-9]+]] = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1} -; TUNIT: [[META20:![0-9]+]] = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1} ;. 
; CGSCC: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50}
 ; CGSCC: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}
-; CGSCC: [[META2:![0-9]+]] = !{ptr @kernel, !"kernel", i32 1}
-; CGSCC: [[META3:![0-9]+]] = !{ptr @kernel2, !"kernel", i32 1}
-; CGSCC: [[META4:![0-9]+]] = !{ptr @kernel3, !"kernel", i32 1}
-; CGSCC: [[META5:![0-9]+]] = !{ptr @kernel4a1, !"kernel", i32 1}
-; CGSCC: [[META6:![0-9]+]] = !{ptr @kernel4b1, !"kernel", i32 1}
-; CGSCC: [[META7:![0-9]+]] = !{ptr @kernel4a2, !"kernel", i32 1}
-; CGSCC: [[META8:![0-9]+]] = !{ptr @kernel4b2, !"kernel", i32 1}
-; CGSCC: [[META9:![0-9]+]] = !{ptr @kernel4a3, !"kernel", i32 1}
-; CGSCC: [[META10:![0-9]+]] = !{ptr @kernel4b3, !"kernel", i32 1}
-; CGSCC: [[META11:![0-9]+]] = !{ptr @kernel4c1, !"kernel", i32 1}
-; CGSCC: [[META12:![0-9]+]] = !{ptr @kernel4d1, !"kernel", i32 1}
-; CGSCC: [[META13:![0-9]+]] = !{ptr @kernel4c2, !"kernel", i32 1}
-; CGSCC: [[META14:![0-9]+]] = !{ptr @kernel4d2, !"kernel", i32 1}
-; CGSCC: [[META15:![0-9]+]] = !{ptr @kernel4c3, !"kernel", i32 1}
-; CGSCC: [[META16:![0-9]+]] = !{ptr @kernel4d3, !"kernel", i32 1}
-; CGSCC: [[META17:![0-9]+]] = !{ptr @kernel_unknown_and_aligned1, !"kernel", i32 1}
-; CGSCC: [[META18:![0-9]+]] = !{ptr @kernel_unknown_and_aligned2, !"kernel", i32 1}
-; CGSCC: [[META19:![0-9]+]] = !{ptr @kernel_unknown_and_aligned3, !"kernel", i32 1}
-; CGSCC: [[META20:![0-9]+]] = !{ptr @kernel_unknown_and_not_aligned1, !"kernel", i32 1}
 ;.
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 42616155f0cc3..64fef02c8f3f8 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -7622,26 +7622,6 @@ TEST_F(OpenMPIRBuilderTest, createGPUOffloadEntry) {
                                /* Size = */ 0,
                                /* Flags = */ 0, GlobalValue::WeakAnyLinkage);
 
-  // Check nvvm.annotations only created for GPU kernels
-  NamedMDNode *MD = M->getNamedMetadata("nvvm.annotations");
-  EXPECT_NE(MD, nullptr);
-  EXPECT_EQ(MD->getNumOperands(), 1u);
-
-  MDNode *Annotations = MD->getOperand(0);
-  EXPECT_EQ(Annotations->getNumOperands(), 3u);
-
-  Constant *ConstVal =
-      dyn_cast<ConstantAsMetadata>(Annotations->getOperand(0))->getValue();
-  EXPECT_TRUE(isa<Function>(Fn));
-  EXPECT_EQ(ConstVal, cast<Function>(Fn));
-
-  EXPECT_TRUE(Annotations->getOperand(1).equalsStr("kernel"));
-
-  EXPECT_TRUE(mdconst::hasa<ConstantInt>(Annotations->getOperand(2)));
-  APInt IntVal =
-      mdconst::extract<ConstantInt>(Annotations->getOperand(2))->getValue();
-  EXPECT_EQ(IntVal, 1);
-
   // Check kernel attributes
   EXPECT_TRUE(Fn->hasFnAttribute("kernel"));
   EXPECT_TRUE(Fn->hasFnAttribute(Attribute::MustProgress));

From 4ea44eb1e292369b0b3f2f8ad4680081558f1e01 Mon Sep 17 00:00:00 2001
From: Heejin Ahn
Date: Fri, 24 Jan 2025 16:56:26 -0800
Subject: [PATCH 072/432] [WebAssembly] Fix EH feature flags when compiling
 multiple files (#124374)

#124042 caused a problem: when invoking `clang` with multiple files, the
static `HasRun` variables were set while processing the first file, so the
appropriate feature flags were not added for the second and subsequent
files. This fixes the problem by making those `HasRun` variables normal
variables within the enclosing function.
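For illustration, the failure mode reduces to this self-contained sketch
(illustrative names, not the actual driver code): a function-local `static`
inside a lambda belongs to the lambda's call operator, so it is initialized
once per process rather than once per call of the enclosing function.

#include <iostream>

// processFile() stands in for addClangTargetOptions(), which the driver
// calls once per input file within the same process.
void processFile(const char *Name) {
  auto addFeatureFlags = [&] {
    static bool HasRun = false; // one instance, shared across all calls
    if (HasRun)
      return;
    HasRun = true;
    std::cout << "adding feature flags for " << Name << "\n";
  };
  addFeatureFlags();
}

int main() {
  processFile("a.c"); // adds the flags
  processFile("b.c"); // silently skipped before this fix
}

Hoisting `HasRun` into the enclosing function as a plain local, as this
patch does, gives each driver invocation of the function a fresh guard.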
--- clang/lib/Driver/ToolChains/WebAssembly.cpp | 13 +++++++------ clang/test/Driver/wasm-toolchain.c | 8 ++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index eebe3becada65..bd25fd1a8933a 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -344,12 +344,15 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, } } + bool HasBannedIncompatibleOptionsForWasmEHSjLj = false; + bool HasEnabledFeaturesForWasmEHSjLj = false; + // Bans incompatible options for Wasm EH / SjLj. We don't allow using // different modes for EH and SjLj. auto BanIncompatibleOptionsForWasmEHSjLj = [&](StringRef CurOption) { - static bool HasRun = false; - if (HasRun) + if (HasBannedIncompatibleOptionsForWasmEHSjLj) return; + HasBannedIncompatibleOptionsForWasmEHSjLj = true; if (DriverArgs.hasFlag(options::OPT_mno_exception_handing, options::OPT_mexception_handing, false)) getDriver().Diag(diag::err_drv_argument_not_allowed_with) @@ -373,14 +376,13 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, << CurOption << Option; } } - HasRun = true; }; // Enable necessary features for Wasm EH / SjLj in the backend. auto EnableFeaturesForWasmEHSjLj = [&]() { - static bool HasRun = false; - if (HasRun) + if (HasEnabledFeaturesForWasmEHSjLj) return; + HasEnabledFeaturesForWasmEHSjLj = true; CC1Args.push_back("-target-feature"); CC1Args.push_back("+exception-handling"); // The standardized Wasm EH spec requires multivalue and reference-types. @@ -390,7 +392,6 @@ void WebAssembly::addClangTargetOptions(const ArgList &DriverArgs, CC1Args.push_back("+reference-types"); // Backend needs '-exception-model=wasm' to use Wasm EH instructions CC1Args.push_back("-exception-model=wasm"); - HasRun = true; }; if (DriverArgs.getLastArg(options::OPT_fwasm_exceptions)) { diff --git a/clang/test/Driver/wasm-toolchain.c b/clang/test/Driver/wasm-toolchain.c index 2d14052082776..f516a4e457da7 100644 --- a/clang/test/Driver/wasm-toolchain.c +++ b/clang/test/Driver/wasm-toolchain.c @@ -224,6 +224,14 @@ // RUN: | FileCheck -check-prefix=WASM_LEGACY_EH_NO_EH %s // WASM_LEGACY_EH_NO_EH: invalid argument '-wasm-use-legacy-eh' not allowed with '-mno-exception-handling' +// When invoking clang with multiple files in a single command line, target +// feature flags should be equally added to the multiple clang-cc1 command lines +// RUN: %clang -### --target=wasm32-unknown-unknown \ +// RUN: --sysroot=/foo %s %s -mllvm -wasm-enable-sjlj 2>&1 \ +// RUN: | FileCheck -check-prefix=WASM_SJLJ_MULTI_FILES %s +// WASM_SJLJ_MULTI_FILES: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-target-feature" "+multivalue" "-target-feature" "+reference-types" "-exception-model=wasm" +// WASM_SJLJ_MULTI_FILES: "-cc1" {{.*}} "-target-feature" "+exception-handling" "-target-feature" "+multivalue" "-target-feature" "+reference-types" "-exception-model=wasm" + // RUN: %clang -### %s -fsanitize=address --target=wasm32-unknown-emscripten 2>&1 | FileCheck -check-prefix=CHECK-ASAN-EMSCRIPTEN %s // CHECK-ASAN-EMSCRIPTEN: "-fsanitize=address" // CHECK-ASAN-EMSCRIPTEN: "-fsanitize-address-globals-dead-stripping" From 8e31050bc2e02d7a3c654def7d7af899ce1cdb1d Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Fri, 24 Jan 2025 17:57:04 -0800 Subject: [PATCH 073/432] [clang-format] Fix a bug in annotating overloaded co_await decl (#124240) Fixes #124223. 
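For reference, the declaration shape at issue reduces to this minimal
example (illustrative names; requires -std=c++20 for the co_await
operator-function-id). The out-of-line form is what the annotator
previously failed to recognize as a function declaration name:

struct Task {
  struct Awaitable {};
  Awaitable operator co_await(); // in-class declaration
};

// Out-of-line definition: a qualified return type followed by a qualified
// "operator co_await", mirroring the shape in the new regression test.
Task::Awaitable Task::operator co_await() { return {}; }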
---
 clang/lib/Format/TokenAnnotator.cpp           | 2 +-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index f36cf7b638e0d..bc41d43d1438c 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -3784,7 +3784,7 @@ static bool isFunctionDeclarationName(const LangOptions &LangOpts,
         return Next;
       if (Next->is(TT_OverloadedOperator))
         continue;
-      if (Next->isOneOf(tok::kw_new, tok::kw_delete)) {
+      if (Next->isOneOf(tok::kw_new, tok::kw_delete, tok::kw_co_await)) {
         // For 'new[]' and 'delete[]'.
         if (Next->Next &&
             Next->Next->startsSequence(tok::l_square, tok::r_square)) {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 9ac60ce73750b..10587449dcea9 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -1025,6 +1025,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsOverloadedOperators) {
   EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_OverloadedOperatorLParen);
   EXPECT_TOKEN(Tokens[8], tok::amp, TT_PointerOrReference);
   EXPECT_TOKEN(Tokens[12], tok::amp, TT_PointerOrReference);
+
+  Tokens = annotate("SomeLoooooooooooooooooType::Awaitable\n"
+                    "SomeLoooooooooooooooooType::operator co_await();");
+  ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::identifier, TT_FunctionDeclarationName);
+  EXPECT_TOKEN(Tokens[7], tok::l_paren, TT_OverloadedOperatorLParen);
 }
 
 TEST_F(TokenAnnotatorTest, OverloadedOperatorInTemplate) {

From 3b35b4c7f9141c59fbac415e335489494b7d507e Mon Sep 17 00:00:00 2001
From: Jacques Pienaar
Date: Fri, 24 Jan 2025 18:08:44 -0800
Subject: [PATCH 074/432] [mlir] Allow fallback from file line col range to loc (#124321)

This was discussed during the original review, but I made it stricter than
discussed. This change makes FileLineColLoc a pure view and adds a helper
for bytecode serialization (I could have avoided the helper, but that ends
up with more logic and stronger coupling).
---
 mlir/include/mlir/IR/BuiltinDialectBytecode.td | 4 ++--
 mlir/include/mlir/IR/Location.h                | 9 +++++----
 mlir/lib/IR/Location.cpp                       | 6 ++----
 mlir/test/Target/LLVMIR/llvmir-debug.mlir      | 3 +++
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/IR/BuiltinDialectBytecode.td b/mlir/include/mlir/IR/BuiltinDialectBytecode.td
index 87da8fd3568fa..0208e8cdbf293 100644
--- a/mlir/include/mlir/IR/BuiltinDialectBytecode.td
+++ b/mlir/include/mlir/IR/BuiltinDialectBytecode.td
@@ -104,7 +104,7 @@ def FileLineColRange : DialectAttribute<(attr
     WithPrinter<"writeFileLineColRangeLocs($_writer, $_name)">>>>:$rawLocData
 )> {
   let cBuilder = "getFileLineColRange(context, filename, rawLocData)";
-  let printerPredicate = "!::llvm::isa<FileLineColLoc>($_val)";
+  let printerPredicate = "!isStrictFileLineColLoc($_val)";
 }
 
 def FileLineColLoc : DialectAttribute<(attr
   VarInt:$start_line,
   VarInt:$start_column
 )> {
-  let printerPredicate = "::llvm::isa<FileLineColLoc>($_val)";
+  let printerPredicate = "isStrictFileLineColLoc($_val)";
 }
 }
 
diff --git a/mlir/include/mlir/IR/Location.h b/mlir/include/mlir/IR/Location.h
index e206501f5ee6a..8ce36ed415ac1 100644
--- a/mlir/include/mlir/IR/Location.h
+++ b/mlir/include/mlir/IR/Location.h
@@ -177,7 +177,7 @@ class FusedLocWith : public FusedLoc {
 /// column number. This is similar to the type of location that you get from
 /// most source languages.
 ///
-/// FileLineColLoc is a FileLineColRange with exactly one line and column.
+/// FileLineColLoc is a view to FileLineColRange with one line and column.
 class FileLineColLoc : public FileLineColRange {
 public:
   using FileLineColRange::FileLineColRange;
@@ -190,11 +190,12 @@ class FileLineColLoc : public FileLineColRange {
   StringAttr getFilename() const;
   unsigned getLine() const;
   unsigned getColumn() const;
-
-  /// Methods for support type inquiry through isa, cast, and dyn_cast.
-  static bool classof(Attribute attr);
 };
 
+/// Returns true iff the given location is a FileLineColRange with exactly one
+/// line and column.
+bool isStrictFileLineColLoc(Location loc);
+
 //===----------------------------------------------------------------------===//
 // OpaqueLoc
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/IR/Location.cpp b/mlir/lib/IR/Location.cpp
index ce78d30ee0a52..7a4df4fbd46d9 100644
--- a/mlir/lib/IR/Location.cpp
+++ b/mlir/lib/IR/Location.cpp
@@ -177,10 +177,8 @@ unsigned FileLineColLoc::getLine() const { return getStartLine(); }
 
 unsigned FileLineColLoc::getColumn() const { return getStartColumn(); }
 
-bool FileLineColLoc::classof(Attribute attr) {
-  // This could also have been for <= 2. But given this is matching previous
-  // behavior, it is left as is.
-  if (auto range = mlir::dyn_cast<FileLineColRange>(attr))
+bool mlir::isStrictFileLineColLoc(Location loc) {
+  if (auto range = mlir::dyn_cast<FileLineColRange>(loc))
     return range.getImpl()->size() == 2;
   return false;
 }
diff --git a/mlir/test/Target/LLVMIR/llvmir-debug.mlir b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
index eac2c5090a5b5..d15274311d745 100644
--- a/mlir/test/Target/LLVMIR/llvmir-debug.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-debug.mlir
@@ -115,6 +115,9 @@ llvm.func @func_with_debug(%arg: i64) {
   // CHECK: call void @func_no_debug(), !dbg ![[FILE_LOC:[0-9]+]]
   llvm.call @func_no_debug() : () -> () loc("foo.mlir":1:2)
 
+  // CHECK: call void @func_no_debug(), !dbg ![[FILE_LOC:[0-9]+]]
+  llvm.call @func_no_debug() : () -> () loc("foo.mlir":1:2 to 5:6)
+
   // CHECK: call void @func_no_debug(), !dbg ![[NAMED_LOC:[0-9]+]]
   llvm.call @func_no_debug() : () -> () loc("named"("foo.mlir":10:10))

From ac1ba1f9dd7013852cd27f514467f57ee0e6ed16 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 24 Jan 2025 18:30:28 -0800
Subject: [PATCH 075/432] [CodeGen] Introduce a VirtRegOrUnit class to hold
 virtual reg or physical reg unit. NFC (#123768)

LiveIntervals and MachineVerifier were previously using Register to store
this, but reg units are different from physical registers. One important
difference is that 0 is a valid reg unit number, but it is not a valid
physical register. This patch introduces a new VirtRegOrUnit class that is
distinct from Register. It can be converted to/from a virtual Register or
an MCRegUnit. I've made all conversions explicit and used assertions to
check the validity. I also fixed a place in MachineVerifier that was
ignoring reg unit 0.
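The ambiguity reduces to this sketch (simplified stand-in types for
illustration, not the actual LLVM headers): unit 0 is a meaningful register
unit, but a Register-style wrapper treats 0 as "no register", so storing
units in it conflates the two.

#include <cassert>

using MCRegUnit = unsigned; // register unit numbers start at 0

// Simplified stand-in for llvm::Register, where the value 0 means
// "no register".
struct Reg {
  unsigned Id = 0;
  explicit operator bool() const { return Id != 0; }
};

int main() {
  MCRegUnit Unit = 0; // a perfectly valid register unit
  Reg R{Unit};        // the old representation...
  assert(!R);         // ...is indistinguishable from "no register"; this is
                      // how reg unit 0 came to be ignored. VirtRegOrUnit
                      // keeps virtual registers and units distinct instead.
}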
--- llvm/include/llvm/CodeGen/Register.h | 31 +++ .../include/llvm/CodeGen/TargetRegisterInfo.h | 4 +- llvm/lib/CodeGen/LiveIntervals.cpp | 31 +-- llvm/lib/CodeGen/MachineVerifier.cpp | 182 +++++++++--------- 4 files changed, 144 insertions(+), 104 deletions(-) diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h index fac5f00110ef7..f8c6159a3c2dc 100644 --- a/llvm/include/llvm/CodeGen/Register.h +++ b/llvm/include/llvm/CodeGen/Register.h @@ -160,6 +160,37 @@ template <> struct DenseMapInfo { } }; +/// Wrapper class representing a virtual register or register unit. +class VirtRegOrUnit { + unsigned VRegOrUnit; + +public: + constexpr explicit VirtRegOrUnit(MCRegUnit Unit) : VRegOrUnit(Unit) { + assert(!Register::isVirtualRegister(VRegOrUnit)); + } + constexpr explicit VirtRegOrUnit(Register Reg) : VRegOrUnit(Reg.id()) { + assert(Reg.isVirtual()); + } + + constexpr bool isVirtualReg() const { + return Register::isVirtualRegister(VRegOrUnit); + } + + constexpr MCRegUnit asMCRegUnit() const { + assert(!isVirtualReg() && "Not a register unit"); + return VRegOrUnit; + } + + constexpr Register asVirtualReg() const { + assert(isVirtualReg() && "Not a virtual register"); + return Register(VRegOrUnit); + } + + constexpr bool operator==(const VirtRegOrUnit &Other) const { + return VRegOrUnit == Other.VRegOrUnit; + } +}; + } // namespace llvm #endif // LLVM_CODEGEN_REGISTER_H diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 0bf72637de398..63460f5a0dae3 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -466,9 +466,9 @@ class TargetRegisterInfo : public MCRegisterInfo { } /// Returns true if Reg contains RegUnit. - bool hasRegUnit(MCRegister Reg, Register RegUnit) const { + bool hasRegUnit(MCRegister Reg, MCRegUnit RegUnit) const { for (MCRegUnit Unit : regunits(Reg)) - if (Register(Unit) == RegUnit) + if (Unit == RegUnit) return true; return false; } diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 4fdfcf547542d..3485a27335f13 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -1080,10 +1080,10 @@ class LiveIntervals::HMEditor { for (LiveInterval::SubRange &S : LI.subranges()) { if ((S.LaneMask & LaneMask).none()) continue; - updateRange(S, Reg, S.LaneMask); + updateRange(S, VirtRegOrUnit(Reg), S.LaneMask); } } - updateRange(LI, Reg, LaneBitmask::getNone()); + updateRange(LI, VirtRegOrUnit(Reg), LaneBitmask::getNone()); // If main range has a hole and we are moving a subrange use across // the hole updateRange() cannot properly handle it since it only // gets the LiveRange and not the whole LiveInterval. As a result @@ -1110,7 +1110,7 @@ class LiveIntervals::HMEditor { // precomputed live range. for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) if (LiveRange *LR = getRegUnitLI(Unit)) - updateRange(*LR, Unit, LaneBitmask::getNone()); + updateRange(*LR, VirtRegOrUnit(Unit), LaneBitmask::getNone()); } if (hasRegMask) updateRegMaskSlots(); @@ -1119,24 +1119,25 @@ class LiveIntervals::HMEditor { private: /// Update a single live range, assuming an instruction has been moved from /// OldIdx to NewIdx. 
- void updateRange(LiveRange &LR, Register Reg, LaneBitmask LaneMask) { + void updateRange(LiveRange &LR, VirtRegOrUnit VRegOrUnit, + LaneBitmask LaneMask) { if (!Updated.insert(&LR).second) return; LLVM_DEBUG({ dbgs() << " "; - if (Reg.isVirtual()) { - dbgs() << printReg(Reg); + if (VRegOrUnit.isVirtualReg()) { + dbgs() << printReg(VRegOrUnit.asVirtualReg()); if (LaneMask.any()) dbgs() << " L" << PrintLaneMask(LaneMask); } else { - dbgs() << printRegUnit(Reg, &TRI); + dbgs() << printRegUnit(VRegOrUnit.asMCRegUnit(), &TRI); } dbgs() << ":\t" << LR << '\n'; }); if (SlotIndex::isEarlierInstr(OldIdx, NewIdx)) handleMoveDown(LR); else - handleMoveUp(LR, Reg, LaneMask); + handleMoveUp(LR, VRegOrUnit, LaneMask); LLVM_DEBUG(dbgs() << " -->\t" << LR << '\n'); assert(LR.verify()); } @@ -1316,7 +1317,8 @@ class LiveIntervals::HMEditor { /// Update LR to reflect an instruction has been moved upwards from OldIdx /// to NewIdx (NewIdx < OldIdx). - void handleMoveUp(LiveRange &LR, Register Reg, LaneBitmask LaneMask) { + void handleMoveUp(LiveRange &LR, VirtRegOrUnit VRegOrUnit, + LaneBitmask LaneMask) { LiveRange::iterator E = LR.end(); // Segment going into OldIdx. LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex()); @@ -1340,7 +1342,7 @@ class LiveIntervals::HMEditor { SlotIndex DefBeforeOldIdx = std::max(OldIdxIn->start.getDeadSlot(), NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber())); - OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask); + OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, VRegOrUnit, LaneMask); // Did we have a Def at OldIdx? If not we are done now. OldIdxOut = std::next(OldIdxIn); @@ -1498,11 +1500,12 @@ class LiveIntervals::HMEditor { } // Return the last use of reg between NewIdx and OldIdx. - SlotIndex findLastUseBefore(SlotIndex Before, Register Reg, + SlotIndex findLastUseBefore(SlotIndex Before, VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { - if (Reg.isVirtual()) { + if (VRegOrUnit.isVirtualReg()) { SlotIndex LastUse = Before; - for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) { + for (MachineOperand &MO : + MRI.use_nodbg_operands(VRegOrUnit.asVirtualReg())) { if (MO.isUndef()) continue; unsigned SubReg = MO.getSubReg(); @@ -1545,7 +1548,7 @@ class LiveIntervals::HMEditor { // Check if MII uses Reg. for (MIBundleOperands MO(*MII); MO.isValid(); ++MO) if (MO->isReg() && !MO->isUndef() && MO->getReg().isPhysical() && - TRI.hasRegUnit(MO->getReg(), Reg)) + TRI.hasRegUnit(MO->getReg(), VRegOrUnit.asMCRegUnit())) return Idx.getRegSlot(); } // Didn't reach Before. It must be the first instruction in the block. 
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index d41b11307e7bc..46ae3c6b05540 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -313,7 +313,7 @@ struct MachineVerifier { void report(const Twine &Msg, const MachineInstr *MI); void report_context(const LiveInterval &LI) const; - void report_context(const LiveRange &LR, Register VRegUnit, + void report_context(const LiveRange &LR, VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) const; void report_context(const LiveRange::Segment &S) const; void report_context(const VNInfo &VNI) const; @@ -322,18 +322,18 @@ struct MachineVerifier { void report_context_liverange(const LiveRange &LR) const; void report_context_lanemask(LaneBitmask LaneMask) const; void report_context_vreg(Register VReg) const; - void report_context_vreg_regunit(Register VRegOrUnit) const; + void report_context_vreg_regunit(VirtRegOrUnit VRegOrUnit) const; void verifyInlineAsm(const MachineInstr *MI); void checkLiveness(const MachineOperand *MO, unsigned MONum); void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum, SlotIndex UseIdx, const LiveRange &LR, - Register VRegOrUnit, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask = LaneBitmask::getNone()); void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum, SlotIndex DefIdx, const LiveRange &LR, - Register VRegOrUnit, bool SubRangeCheck = false, + VirtRegOrUnit VRegOrUnit, bool SubRangeCheck = false, LaneBitmask LaneMask = LaneBitmask::getNone()); void markReachable(const MachineBasicBlock *MBB); @@ -344,12 +344,12 @@ struct MachineVerifier { void verifyLiveVariables(); void verifyLiveIntervals(); void verifyLiveInterval(const LiveInterval &); - void verifyLiveRangeValue(const LiveRange &, const VNInfo *, Register, + void verifyLiveRangeValue(const LiveRange &, const VNInfo *, VirtRegOrUnit, LaneBitmask); void verifyLiveRangeSegment(const LiveRange &, - const LiveRange::const_iterator I, Register, + const LiveRange::const_iterator I, VirtRegOrUnit, LaneBitmask); - void verifyLiveRange(const LiveRange &, Register, + void verifyLiveRange(const LiveRange &, VirtRegOrUnit, LaneBitmask LaneMask = LaneBitmask::getNone()); void verifyStackFrame(); @@ -636,10 +636,11 @@ void MachineVerifier::report_context(const LiveInterval &LI) const { OS << "- interval: " << LI << '\n'; } -void MachineVerifier::report_context(const LiveRange &LR, Register VRegUnit, +void MachineVerifier::report_context(const LiveRange &LR, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) const { report_context_liverange(LR); - report_context_vreg_regunit(VRegUnit); + report_context_vreg_regunit(VRegOrUnit); if (LaneMask.any()) report_context_lanemask(LaneMask); } @@ -664,11 +665,13 @@ void MachineVerifier::report_context_vreg(Register VReg) const { OS << "- v. 
register: " << printReg(VReg, TRI) << '\n'; } -void MachineVerifier::report_context_vreg_regunit(Register VRegOrUnit) const { - if (VRegOrUnit.isVirtual()) { - report_context_vreg(VRegOrUnit); +void MachineVerifier::report_context_vreg_regunit( + VirtRegOrUnit VRegOrUnit) const { + if (VRegOrUnit.isVirtualReg()) { + report_context_vreg(VRegOrUnit.asVirtualReg()); } else { - OS << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n'; + OS << "- regunit: " << printRegUnit(VRegOrUnit.asMCRegUnit(), TRI) + << '\n'; } } @@ -2828,7 +2831,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { void MachineVerifier::checkLivenessAtUse(const MachineOperand *MO, unsigned MONum, SlotIndex UseIdx, const LiveRange &LR, - Register VRegOrUnit, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { const MachineInstr *MI = MO->getParent(); @@ -2863,7 +2866,7 @@ void MachineVerifier::checkLivenessAtUse(const MachineOperand *MO, void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO, unsigned MONum, SlotIndex DefIdx, const LiveRange &LR, - Register VRegOrUnit, + VirtRegOrUnit VRegOrUnit, bool SubRangeCheck, LaneBitmask LaneMask) { if (!LR.verify()) { @@ -2908,7 +2911,7 @@ void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO, if (MO->isDead()) { LiveQueryResult LRQ = LR.Query(DefIdx); if (!LRQ.isDeadDef()) { - assert(VRegOrUnit.isVirtual() && "Expecting a virtual register."); + assert(VRegOrUnit.isVirtualReg() && "Expecting a virtual register."); // A dead subreg def only tells us that the specific subreg is dead. There // could be other non-dead defs of other subregs, or we could have other // parts of the register being live through the instruction. So unless we @@ -2973,13 +2976,13 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (MRI->isReservedRegUnit(Unit)) continue; if (const LiveRange *LR = LiveInts->getCachedRegUnit(Unit)) - checkLivenessAtUse(MO, MONum, UseIdx, *LR, Unit); + checkLivenessAtUse(MO, MONum, UseIdx, *LR, VirtRegOrUnit(Unit)); } } if (Reg.isVirtual()) { // This is a virtual register interval. 
- checkLivenessAtUse(MO, MONum, UseIdx, *LI, Reg); + checkLivenessAtUse(MO, MONum, UseIdx, *LI, VirtRegOrUnit(Reg)); if (LI->hasSubRanges() && !MO->isDef()) { LaneBitmask MOMask = SubRegIdx != 0 @@ -2989,7 +2992,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { for (const LiveInterval::SubRange &SR : LI->subranges()) { if ((MOMask & SR.LaneMask).none()) continue; - checkLivenessAtUse(MO, MONum, UseIdx, SR, Reg, SR.LaneMask); + checkLivenessAtUse(MO, MONum, UseIdx, SR, VirtRegOrUnit(Reg), + SR.LaneMask); LiveQueryResult LRQ = SR.Query(UseIdx); if (LRQ.valueIn() || (MI->isPHI() && LRQ.valueOut())) LiveInMask |= SR.LaneMask; @@ -3081,7 +3085,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber()); if (Reg.isVirtual()) { - checkLivenessAtDef(MO, MONum, DefIdx, *LI, Reg); + checkLivenessAtDef(MO, MONum, DefIdx, *LI, VirtRegOrUnit(Reg)); if (LI->hasSubRanges()) { LaneBitmask MOMask = SubRegIdx != 0 @@ -3090,7 +3094,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { for (const LiveInterval::SubRange &SR : LI->subranges()) { if ((SR.LaneMask & MOMask).none()) continue; - checkLivenessAtDef(MO, MONum, DefIdx, SR, Reg, true, SR.LaneMask); + checkLivenessAtDef(MO, MONum, DefIdx, SR, VirtRegOrUnit(Reg), true, + SR.LaneMask); } } } @@ -3532,11 +3537,12 @@ void MachineVerifier::verifyLiveIntervals() { // Verify all the cached regunit intervals. for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i) if (const LiveRange *LR = LiveInts->getCachedRegUnit(i)) - verifyLiveRange(*LR, i); + verifyLiveRange(*LR, VirtRegOrUnit(i)); } void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, - const VNInfo *VNI, Register Reg, + const VNInfo *VNI, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { if (VNI->isUnused()) return; @@ -3545,14 +3551,14 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, if (!DefVNI) { report("Value not live at VNInfo def and not marked unused", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } if (DefVNI != VNI) { report("Live segment at def has different VNInfo", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } @@ -3560,7 +3566,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def); if (!MBB) { report("Invalid VNInfo definition index", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } @@ -3568,7 +3574,7 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, if (VNI->isPHIDef()) { if (VNI->def != LiveInts->getMBBStartIdx(MBB)) { report("PHIDef VNInfo is not defined at MBB start", MBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); } return; @@ -3578,57 +3584,56 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR, const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def); if (!MI) { report("No instruction at VNInfo def index", MBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); return; } - if (Reg != 0) { - bool hasDef = false; - bool isEarlyClobber = false; - for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { - if (!MOI->isReg() || !MOI->isDef()) + bool hasDef = false; + bool 
isEarlyClobber = false; + for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + if (VRegOrUnit.isVirtualReg()) { + if (MOI->getReg() != VRegOrUnit.asVirtualReg()) continue; - if (Reg.isVirtual()) { - if (MOI->getReg() != Reg) - continue; - } else { - if (!MOI->getReg().isPhysical() || !TRI->hasRegUnit(MOI->getReg(), Reg)) - continue; - } - if (LaneMask.any() && - (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask).none()) + } else { + if (!MOI->getReg().isPhysical() || + !TRI->hasRegUnit(MOI->getReg(), VRegOrUnit.asMCRegUnit())) continue; - hasDef = true; - if (MOI->isEarlyClobber()) - isEarlyClobber = true; } + if (LaneMask.any() && + (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask).none()) + continue; + hasDef = true; + if (MOI->isEarlyClobber()) + isEarlyClobber = true; + } - if (!hasDef) { - report("Defining instruction does not modify register", MI); - report_context(LR, Reg, LaneMask); - report_context(*VNI); - } + if (!hasDef) { + report("Defining instruction does not modify register", MI); + report_context(LR, VRegOrUnit, LaneMask); + report_context(*VNI); + } - // Early clobber defs begin at USE slots, but other defs must begin at - // DEF slots. - if (isEarlyClobber) { - if (!VNI->def.isEarlyClobber()) { - report("Early clobber def must be at an early-clobber slot", MBB); - report_context(LR, Reg, LaneMask); - report_context(*VNI); - } - } else if (!VNI->def.isRegister()) { - report("Non-PHI, non-early clobber def must be at a register slot", MBB); - report_context(LR, Reg, LaneMask); + // Early clobber defs begin at USE slots, but other defs must begin at + // DEF slots. + if (isEarlyClobber) { + if (!VNI->def.isEarlyClobber()) { + report("Early clobber def must be at an early-clobber slot", MBB); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); } + } else if (!VNI->def.isRegister()) { + report("Non-PHI, non-early clobber def must be at a register slot", MBB); + report_context(LR, VRegOrUnit, LaneMask); + report_context(*VNI); } } void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, const LiveRange::const_iterator I, - Register Reg, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { const LiveRange::Segment &S = *I; const VNInfo *VNI = S.valno; @@ -3636,28 +3641,28 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) { report("Foreign valno in live segment", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); report_context(*VNI); } if (VNI->isUnused()) { report("Live segment valno is marked unused", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start); if (!MBB) { report("Bad start of live segment, no basic block", MF); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); return; } SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB); if (S.start != MBBStartIdx && S.start != VNI->def) { report("Live segment must begin at MBB entry or valno def", MBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } @@ -3665,7 +3670,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, LiveInts->getMBBFromIndex(S.end.getPrevSlot()); if (!EndMBB) { report("Bad end of live segment, no basic block", MF); - 
report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); return; } @@ -3673,7 +3678,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Checks for non-live-out segments. if (S.end != LiveInts->getMBBEndIdx(EndMBB)) { // RegUnit intervals are allowed dead phis. - if (!Reg.isVirtual() && VNI->isPHIDef() && S.start == VNI->def && + if (!VRegOrUnit.isVirtualReg() && VNI->isPHIDef() && S.start == VNI->def && S.end == VNI->def.getDeadSlot()) return; @@ -3682,7 +3687,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, LiveInts->getInstructionFromIndex(S.end.getPrevSlot()); if (!MI) { report("Live segment doesn't end at a valid instruction", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); return; } @@ -3690,7 +3695,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // The block slot must refer to a basic block boundary. if (S.end.isBlock()) { report("Live segment ends at B slot of an instruction", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } @@ -3699,7 +3704,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // That means there must be a dead def. if (!SlotIndex::isSameInstr(S.start, S.end)) { report("Live segment ending at dead slot spans instructions", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } @@ -3715,21 +3720,21 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, report("Live segment ending at early clobber slot must be " "redefined by an EC def in the same instruction", EndMBB); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } // The following checks only apply to virtual registers. Physreg liveness // is too weird to check. - if (Reg.isVirtual()) { + if (VRegOrUnit.isVirtualReg()) { // A live segment can end with either a redefinition, a kill flag on a // use, or a dead flag on a def. bool hasRead = false; bool hasSubRegDef = false; bool hasDeadDef = false; for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) { - if (!MOI->isReg() || MOI->getReg() != Reg) + if (!MOI->isReg() || MOI->getReg() != VRegOrUnit.asVirtualReg()) continue; unsigned Sub = MOI->getSubReg(); LaneBitmask SLM = @@ -3758,18 +3763,18 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, report( "Instruction ending live segment on dead slot has no dead flag", MI); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } else { if (!hasRead) { // When tracking subregister liveness, the main range must start new // values on partial register writes, even if there is no read. 
- if (!MRI->shouldTrackSubRegLiveness(Reg) || LaneMask.any() || - !hasSubRegDef) { + if (!MRI->shouldTrackSubRegLiveness(VRegOrUnit.asVirtualReg()) || + LaneMask.any() || !hasSubRegDef) { report("Instruction ending live segment doesn't read the register", MI); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(S); } } @@ -3790,14 +3795,14 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, SmallVector Undefs; if (LaneMask.any()) { - LiveInterval &OwnerLI = LiveInts->getInterval(Reg); + LiveInterval &OwnerLI = LiveInts->getInterval(VRegOrUnit.asVirtualReg()); OwnerLI.computeSubRangeUndefs(Undefs, LaneMask, *MRI, *Indexes); } while (true) { assert(LiveInts->isLiveInToMBB(LR, &*MFI)); // We don't know how to track physregs into a landing pad. - if (!Reg.isVirtual() && MFI->isEHPad()) { + if (!VRegOrUnit.isVirtualReg() && MFI->isEHPad()) { if (&*MFI == EndMBB) break; ++MFI; @@ -3830,7 +3835,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (LiveRangeCalc::isJointlyDominated(Pred, Undefs, *Indexes)) continue; report("Register not marked live out of predecessor", Pred); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); report_context(*VNI); OS << " live into " << printMBBReference(*MFI) << '@' << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " << PEnd @@ -3841,7 +3846,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, // Only PHI-defs can take different predecessor values. if (!IsPHI && PVNI != VNI) { report("Different value live out of predecessor", Pred); - report_context(LR, Reg, LaneMask); + report_context(LR, VRegOrUnit, LaneMask); OS << "Valno #" << PVNI->id << " live out of " << printMBBReference(*Pred) << '@' << PEnd << "\nValno #" << VNI->id << " live into " << printMBBReference(*MFI) << '@' @@ -3854,19 +3859,20 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, } } -void MachineVerifier::verifyLiveRange(const LiveRange &LR, Register Reg, +void MachineVerifier::verifyLiveRange(const LiveRange &LR, + VirtRegOrUnit VRegOrUnit, LaneBitmask LaneMask) { for (const VNInfo *VNI : LR.valnos) - verifyLiveRangeValue(LR, VNI, Reg, LaneMask); + verifyLiveRangeValue(LR, VNI, VRegOrUnit, LaneMask); for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I) - verifyLiveRangeSegment(LR, I, Reg, LaneMask); + verifyLiveRangeSegment(LR, I, VRegOrUnit, LaneMask); } void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { Register Reg = LI.reg(); assert(Reg.isVirtual()); - verifyLiveRange(LI, Reg); + verifyLiveRange(LI, VirtRegOrUnit(Reg)); if (LI.hasSubRanges()) { LaneBitmask Mask; @@ -3882,10 +3888,10 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { } if (SR.empty()) { report("Subrange must not be empty", MF); - report_context(SR, LI.reg(), SR.LaneMask); + report_context(SR, VirtRegOrUnit(LI.reg()), SR.LaneMask); } Mask |= SR.LaneMask; - verifyLiveRange(SR, LI.reg(), SR.LaneMask); + verifyLiveRange(SR, VirtRegOrUnit(LI.reg()), SR.LaneMask); if (!LI.covers(SR)) { report("A Subrange is not covered by the main range", MF); report_context(LI); From 8a6b44bf4cfe5df3db687a6b9519e99dbce8cf54 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sat, 25 Jan 2025 03:55:09 +0000 Subject: [PATCH 076/432] Revert "[libc++] Fix tests for clang::no_specializations for C++17 and C++20" This reverts commit 4df9c17e5f436702ca4f5439322972b0385d629a. 
Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/24/builds/4598/steps/10/logs/stdio) --- libcxx/include/__type_traits/result_of.h | 2 +- .../ranges/no_specializations.verify.cpp | 4 +--- .../type_traits/no_specializations.verify.cpp | 24 +++++++------------ 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index 8cc009dbe8baa..217ca70b4cd20 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template -struct _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS result_of; +struct _LIBCPP_DEPRECATED_IN_CXX17 result_of; template struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {}; diff --git a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp index 489e3a6a73744..69d458a920558 100644 --- a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp @@ -13,9 +13,7 @@ #include -#include "test_macros.h" - -#if !__has_warning("-Winvalid-specialization") || TEST_STD_VER <= 20 +#if !__has_warning("-Winvalid-specialization") // expected-no-diagnostics #else struct S {}; diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 807d01e381b49..e6d960667e8c0 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -36,22 +36,15 @@ SPECIALIZE_TRAIT(make_unsigned); // expected-error {{cannot be specialize SPECIALIZE_TRAIT(remove_all_extents); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_const); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_cv); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_extent); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_pointer); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_reference); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_volatile); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(underlying_type); // expected-error {{cannot be specialized}} - -# if TEST_STD_VER <= 17 -SPECIALIZE_TRAIT(result_of); // expected-error {{cannot be specialized}} -# endif - -# if TEST_STD_VER >= 20 -SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} -# endif +SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} # undef SPECIALIZE_TRAIT # define SPECIALIZE_UTT(Trait) \ @@ -103,6 +96,7 @@ SPECIALIZE_UTT(is_move_assignable); // expected-error 2 {{cannot SPECIALIZE_UTT(is_move_constructible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_BTT(is_nothrow_assignable); // expected-error 2 {{cannot be specialized}} 
SPECIALIZE_UTT(is_nothrow_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_nothrow_copy_assignable); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_nothrow_copy_constructible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_nothrow_default_constructible); // expected-error 2 {{cannot be specialized}} @@ -136,6 +130,7 @@ SPECIALIZE_UTT(is_trivially_default_constructible); // expected-error 2 {{cannot SPECIALIZE_UTT(is_trivially_destructible); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_trivially_move_assignable); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_trivially_move_constructible); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_union); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_unsigned); // expected-error 2 {{cannot be specialized}} SPECIALIZE_UTT(is_void); // expected-error 2 {{cannot be specialized}} @@ -145,12 +140,11 @@ SPECIALIZE_UTT(rank); // expected-error 2 {{cannot # if TEST_STD_VER <= 17 SPECIALIZE_UTT(is_literal_type); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(result_of); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 20 -SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 23 @@ -177,8 +171,6 @@ struct std::conditional; // expected-error {{cannot be specialized}} template <> struct std::enable_if; // expected-error {{cannot be specialized}} -#if TEST_STD_VER >= 20 template <> struct std::integral_constant; // expected-error {{cannot be specialized}} #endif -#endif From 1f26ac10ca1bef40a80be8f81a6f109713bc586f Mon Sep 17 00:00:00 2001 From: mconst Date: Fri, 24 Jan 2025 20:03:57 -0800 Subject: [PATCH 077/432] [X86] Better handling of impossibly large stack frames (#124217) If you try to create a stack frame of 4 GiB or larger with a 32-bit stack pointer, we currently emit invalid instructions like `mov eax, 5000000000` (unless you specify `-fstack-clash-protection`, in which case we emit a trap instead). The trap seems nicer, so let's do that in all cases. This avoids emitting invalid instructions, and also fixes the "can't have 32-bit 16GB stack frame" assertion in `X86FrameLowering::emitSPUpdate()` (which used to be triggerable by user code, but is now correct). This was originally part of #124041. @phoebewang --- llvm/lib/Target/X86/X86FrameLowering.cpp | 13 +++++++--- llvm/test/CodeGen/X86/huge-stack-offset.ll | 8 +++--- .../CodeGen/X86/stack-clash-extra-huge.ll | 26 ++----------------- 3 files changed, 14 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 47cc6a18ef843..a7b60afb7f547 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -234,6 +234,14 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB, MachineInstr::MIFlag Flag = isSub ? 
MachineInstr::FrameSetup : MachineInstr::FrameDestroy; + if (!Uses64BitFramePtr && !isUInt<32>(Offset)) { + // We're being asked to adjust a 32-bit stack pointer by 4 GiB or more. + // This might be unreachable code, so don't complain now; just trap if + // it's reached at runtime. + BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); + return; + } + uint64_t Chunk = (1LL << 31) - 1; MachineFunction &MF = *MBB.getParent(); @@ -829,10 +837,7 @@ void X86FrameLowering::emitStackProbeInlineGenericLoop( .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } else { - // We're being asked to probe a stack frame that's 4 GiB or larger, - // but our stack pointer is only 32 bits. This might be unreachable - // code, so don't complain now; just trap if it's reached at runtime. - BuildMI(MBB, MBBI, DL, TII.get(X86::TRAP)); + llvm_unreachable("Offset too large for 32-bit stack pointer"); } // while in the loop, use loop-invariant reg for CFI, diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll index e825328ccd89a..6629811a59b23 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll @@ -13,11 +13,9 @@ define void @foo() nounwind { ; CHECK-64-NEXT: addq [[RAX]], %rsp ; CHECK-32-LABEL: foo: -; CHECK-32: movl $50000000{{..}}, %eax -; CHECK-32-NEXT: subl %eax, %esp +; CHECK-32: ud2 ; CHECK-32-NOT: subl $2147483647, %esp -; CHECK-32: movl $50000000{{..}}, [[EAX:%e..]] -; CHECK-32-NEXT: addl [[EAX]], %esp +; CHECK-32: ud2 %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) ret void @@ -46,7 +44,7 @@ define i32 @foo3(i32 inreg %x) nounwind { ; CHECK-64-NEXT: subq %rax, %rsp ; CHECK-32-LABEL: foo3: -; CHECK-32: subl $2147483647, %esp +; CHECK-32: ud2 ; CHECK-32-NOT: movl ${{.*}}, %eax %1 = alloca [5000000000 x i8], align 16 call void @bar(ptr %1) diff --git a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll index b8031056fd6b0..d9b20f50e9a88 100644 --- a/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll +++ b/llvm/test/CodeGen/X86/stack-clash-extra-huge.ll @@ -30,44 +30,22 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-X86-LABEL: foo: ; CHECK-X86: # %bb.0: ; CHECK-X86-NEXT: ud2 -; CHECK-X86-NEXT: .cfi_def_cfa_register %eax -; CHECK-X86-NEXT: .cfi_adjust_cfa_offset 4800000000 -; CHECK-X86-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X86-NEXT: movl $0, (%esp) -; CHECK-X86-NEXT: cmpl %eax, %esp -; CHECK-X86-NEXT: jne .LBB0_1 -; CHECK-X86-NEXT: # %bb.2: -; CHECK-X86-NEXT: subl $12, %esp -; CHECK-X86-NEXT: .cfi_def_cfa_register %esp ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4800000016 ; CHECK-X86-NEXT: movl $1, 392(%esp) ; CHECK-X86-NEXT: movl $1, 28792(%esp) ; CHECK-X86-NEXT: movl (%esp), %eax -; CHECK-X86-NEXT: movl $4800000012, %ecx # imm = 0x11E1A300C -; CHECK-X86-NEXT: addl %ecx, %esp +; CHECK-X86-NEXT: ud2 ; CHECK-X86-NEXT: .cfi_def_cfa_offset 4 ; CHECK-X86-NEXT: retl ; ; CHECK-X32-LABEL: foo: ; CHECK-X32: # %bb.0: ; CHECK-X32-NEXT: ud2 -; CHECK-X32-NEXT: .cfi_def_cfa_register %r11 -; CHECK-X32-NEXT: .cfi_adjust_cfa_offset 4799995904 -; CHECK-X32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X32-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X32-NEXT: movq $0, (%esp) -; CHECK-X32-NEXT: cmpl %r11d, %esp -; CHECK-X32-NEXT: jne .LBB0_1 -; CHECK-X32-NEXT: # %bb.2: -; CHECK-X32-NEXT: subl $3976, %esp # imm = 0xF88 -; CHECK-X32-NEXT: .cfi_def_cfa_register %rsp ; CHECK-X32-NEXT: 
.cfi_def_cfa_offset 4799999888 ; CHECK-X32-NEXT: movl $1, 264(%esp) ; CHECK-X32-NEXT: movl $1, 28664(%esp) ; CHECK-X32-NEXT: movl -128(%esp), %eax -; CHECK-X32-NEXT: movl $4799999880, %ecx # imm = 0x11E1A2F88 -; CHECK-X32-NEXT: addl %ecx, %esp +; CHECK-X32-NEXT: ud2 ; CHECK-X32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X32-NEXT: retq %a = alloca i32, i64 1200000000, align 16 From f607e3fd23ef0019b2f3b289b4d46012400b8db5 Mon Sep 17 00:00:00 2001 From: Valentyn Yukhymenko Date: Sat, 25 Jan 2025 07:01:40 +0000 Subject: [PATCH 078/432] [Clang][Sema] Reject declaring an alias template with the same name as its template parameter. (#123533) The issue occurred because the template parameter scope was skipped too early, before diagnosing the alias name shadowing. To fix this, the patch moves it to after LookupName, such that the behavior remains consistent with the typedef implementation. Fixes llvm#123423 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/Sema/SemaDeclCXX.cpp | 5 +++-- .../temp/temp.decls/temp.variadic/fixed-expansion.cpp | 4 ++-- clang/test/SemaCXX/alias-template.cpp | 10 ++++++++-- 4 files changed, 14 insertions(+), 6 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index f110b8cf76507..e9fffddd507c6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -990,6 +990,7 @@ Bug Fixes to C++ Support - Fix immediate escalation not propagating through inherited constructors. (#GH112677) - Fixed assertions or false compiler diagnostics in the case of C++ modules for lambda functions or inline friend functions defined inside templates (#GH122493). +- Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 839b3a1cccdcc..08065e3cad2bb 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -13406,8 +13406,6 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, SourceLocation UsingLoc, UnqualifiedId &Name, const ParsedAttributesView &AttrList, TypeResult Type, Decl *DeclFromDeclSpec) { - // Get the innermost enclosing declaration scope. - S = S->getDeclParent(); if (Type.isInvalid()) return nullptr; @@ -13458,6 +13456,9 @@ Decl *Sema::ActOnAliasDeclaration(Scope *S, AccessSpecifier AS, CheckTypedefForVariablyModifiedType(S, NewTD); Invalid |= NewTD->isInvalidDecl(); + // Get the innermost enclosing declaration scope. 
+ S = S->getDeclParent(); + bool Redeclaration = false; NamedDecl *NewND; diff --git a/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp b/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp index a990c82564aa4..ab4c663d24c7d 100644 --- a/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp +++ b/clang/test/CXX/temp/temp.decls/temp.variadic/fixed-expansion.cpp @@ -121,8 +121,8 @@ namespace PartialSpecialization { namespace FixedAliasTemplate { template struct S {}; - template using U = S; // expected-note 2{{template parameter is declared here}} - template U &f(U, Ts...); // expected-error 2{{pack expansion used as argument for non-pack parameter of alias template}} + template using Z = S; // expected-note 2{{template parameter is declared here}} + template Z &f(Z, Ts...); // expected-error 2{{pack expansion used as argument for non-pack parameter of alias template}} S &s1 = f({}, 0, 0.0); // expected-error {{no matching function}} } diff --git a/clang/test/SemaCXX/alias-template.cpp b/clang/test/SemaCXX/alias-template.cpp index 5189405e23db5..b49d36a6267e6 100644 --- a/clang/test/SemaCXX/alias-template.cpp +++ b/clang/test/SemaCXX/alias-template.cpp @@ -54,18 +54,24 @@ namespace LookupFilter { template using S = S*; // ok } -namespace InFunctions { +namespace UnexpandedPack { template struct S0 { template using U = T*; // expected-error {{declaration type contains unexpanded parameter pack 'T'}} U u; }; +} +namespace InvalidType { template using T1 = int; template using T2 = int[-1]; // expected-error {{array size is negative}} +} + +namespace ShadowTemplateParam { template struct S3 { // expected-note {{template parameter is declared here}} template using T = int; // expected-error {{declaration of 'T' shadows template parameter}} }; - template using Z = Z; + template // expected-note {{template parameter is declared here}} + using Z = Z; // expected-error {{declaration of 'Z' shadows template parameter}} } namespace ClassNameRedecl { From c216081e981ea14536024b86df79ddee9fe517e0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:15:38 -0800 Subject: [PATCH 079/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124388) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect P to be nonnull. 
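
To make the mechanical change concrete, here is a minimal sketch of the
before/after pattern. This is illustration only: Foo, Bar, and the variable
names are placeholders, not code from this patch.

    #include "llvm/ADT/PointerUnion.h"

    struct Foo { int X = 0; };
    struct Bar { int Y = 0; };

    void example(Foo &F) {
      llvm::PointerUnion<Foo *, Bar *> P = &F; // nonnull, holds a Foo *

      // Old, soft-deprecated member form:
      if (Foo *FP = P.dyn_cast<Foo *>())
        (void)FP;

      // New free-function form used by this migration. llvm::dyn_cast
      // asserts that P is nonnull; llvm::dyn_cast_if_present is the
      // variant that also tolerates a null union.
      if (Foo *FP = llvm::dyn_cast<Foo *>(P))
        (void)FP;
    }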
--- clang/lib/AST/DeclTemplate.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 40ee3753c2422..2933ba7fb8a29 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1049,7 +1049,7 @@ ClassTemplateSpecializationDecl::getSourceRange() const { assert(!Pattern.isNull() && "Class template specialization without pattern?"); if (const auto *CTPSD = - Pattern.dyn_cast()) + dyn_cast(Pattern)) return CTPSD->getSourceRange(); return cast(Pattern)->getSourceRange(); } @@ -1773,7 +1773,7 @@ TemplateParameterList *clang::getReplacedTemplateParameterList(Decl *D) { const auto *CTSD = cast(D); auto P = CTSD->getSpecializedTemplateOrPartial(); if (const auto *CTPSD = - P.dyn_cast()) + dyn_cast(P)) return CTPSD->getTemplateParameters(); return cast(P)->getTemplateParameters(); } From 186d6546d9c5898a0a32f4616558021d9a908786 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:16:00 -0800 Subject: [PATCH 080/432] [Index] Migrate away from PointerUnion::dyn_cast (NFC) (#124389) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect DclInfo.DeclOrMacro to be nonnull. --- clang/lib/Index/FileIndexRecord.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Index/FileIndexRecord.cpp b/clang/lib/Index/FileIndexRecord.cpp index 449c33637eb7e..cf40a596f5094 100644 --- a/clang/lib/Index/FileIndexRecord.cpp +++ b/clang/lib/Index/FileIndexRecord.cpp @@ -55,7 +55,7 @@ void FileIndexRecord::removeHeaderGuardMacros() { void FileIndexRecord::print(llvm::raw_ostream &OS, SourceManager &SM) const { OS << "DECLS BEGIN ---\n"; for (auto &DclInfo : Decls) { - if (const auto *D = DclInfo.DeclOrMacro.dyn_cast()) { + if (const auto *D = dyn_cast(DclInfo.DeclOrMacro)) { SourceLocation Loc = SM.getFileLoc(D->getLocation()); PresumedLoc PLoc = SM.getPresumedLoc(Loc); OS << llvm::sys::path::filename(PLoc.getFilename()) << ':' From 62bd217b5a1cf6b231b2413b5522533986d4e5df Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:16:17 -0800 Subject: [PATCH 081/432] [Sema] Migrate away from PointerUnion::dyn_cast (NFC) (#124391) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect DeclOrIterator to be nonnull. 
--- clang/lib/Sema/SemaCodeComplete.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp index 58f3efbe0daf8..bc0f6a9435f95 100644 --- a/clang/lib/Sema/SemaCodeComplete.cpp +++ b/clang/lib/Sema/SemaCodeComplete.cpp @@ -678,7 +678,7 @@ class ResultBuilder::ShadowMapEntry::iterator { }*/ reference operator*() const { - if (const NamedDecl *ND = DeclOrIterator.dyn_cast()) + if (const NamedDecl *ND = dyn_cast(DeclOrIterator)) return reference(ND, SingleDeclIndex); return *cast(DeclOrIterator); From 0cc74a8941884d56a4718c28cc5b8ef8dbe17047 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:17:22 -0800 Subject: [PATCH 082/432] [CodeGen] Avoid repeated hash lookups (NFC) (#124392) --- llvm/lib/CodeGen/ModuloSchedule.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp index d99b6ace01000..f9fe812f7e65c 100644 --- a/llvm/lib/CodeGen/ModuloSchedule.cpp +++ b/llvm/lib/CodeGen/ModuloSchedule.cpp @@ -397,8 +397,9 @@ void ModuloScheduleExpander::generateExistingPhis( // The Phi value from the loop body typically is defined in the loop, but // not always. So, we need to check if the value is defined in the loop. unsigned PhiOp2 = LoopVal; - if (VRMap[LastStageNum].count(LoopVal)) - PhiOp2 = VRMap[LastStageNum][LoopVal]; + if (auto It = VRMap[LastStageNum].find(LoopVal); + It != VRMap[LastStageNum].end()) + PhiOp2 = It->second; int StageScheduled = Schedule.getStage(&*BBI); int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal)); @@ -1055,8 +1056,8 @@ void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI, // Make an adjustment to get the last definition. StageNum -= StageDiff; } - if (VRMap[StageNum].count(reg)) - MO.setReg(VRMap[StageNum][reg]); + if (auto It = VRMap[StageNum].find(reg); It != VRMap[StageNum].end()) + MO.setReg(It->second); } } } @@ -1710,8 +1711,8 @@ void PeelingModuloScheduleExpander::moveStageBetweenBlocks( for (MachineOperand &MO : I->uses()) { if (!MO.isReg()) continue; - if (Remaps.count(MO.getReg())) - MO.setReg(Remaps[MO.getReg()]); + if (auto It = Remaps.find(MO.getReg()); It != Remaps.end()) + MO.setReg(It->second); else { // If we are using a phi from the source block we need to add a new phi // pointing to the old one. From 72918fd11dd805b578bbc9c4f36bea3bc96f37b5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 01:17:38 -0800 Subject: [PATCH 083/432] [GlobalISel] Avoid repeated hash lookups (NFC) (#124393) --- llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp index 3a9069848ca1d..0222069cfc576 100644 --- a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp @@ -181,10 +181,7 @@ MachineInstr *GISelCSEInfo::getMachineInstrIfExists(FoldingSetNodeID &ID, void GISelCSEInfo::countOpcodeHit(unsigned Opc) { #ifndef NDEBUG - if (OpcodeHitTable.count(Opc)) - OpcodeHitTable[Opc] += 1; - else - OpcodeHitTable[Opc] = 1; + ++OpcodeHitTable[Opc]; #endif // Else do nothing. } From 84d4037488f5b366e76be4fe723e0de7aeee264d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Sat, 25 Jan 2025 10:12:19 +0100 Subject: [PATCH 084/432] Reapply "[libc++] Fix tests for clang::no_specializations for C++17 and C++20" The missing diagnostic pragmas have been added. 
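
Concretely, the reapplied patch wraps libc++'s own partial specialization in
__type_traits/result_of.h with a push/ignore/pop sequence, sketched here with
the template parameter list spelled out (the full hunk appears below):

    _LIBCPP_DIAGNOSTIC_PUSH
    #if __has_warning("-Winvalid-specialization")
    _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization")
    #endif
    template <class _Fp, class... _Args>
    struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {};
    _LIBCPP_DIAGNOSTIC_POP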
This reverts commit 8a6b44bf4cfe5df3db687a6b9519e99dbce8cf54. --- libcxx/include/__type_traits/result_of.h | 7 +++++- .../ranges/no_specializations.verify.cpp | 4 +++- .../type_traits/no_specializations.verify.cpp | 24 ++++++++++++------- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__type_traits/result_of.h b/libcxx/include/__type_traits/result_of.h index 217ca70b4cd20..e6adec7f9c978 100644 --- a/libcxx/include/__type_traits/result_of.h +++ b/libcxx/include/__type_traits/result_of.h @@ -22,10 +22,15 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_TYPE_TRAITS) template -struct _LIBCPP_DEPRECATED_IN_CXX17 result_of; +struct _LIBCPP_DEPRECATED_IN_CXX17 _LIBCPP_NO_SPECIALIZATIONS result_of; +_LIBCPP_DIAGNOSTIC_PUSH +#if __has_warning("-Winvalid-specialization") +_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") +#endif template struct _LIBCPP_TEMPLATE_VIS result_of<_Fp(_Args...)> : __invoke_result<_Fp, _Args...> {}; +_LIBCPP_DIAGNOSTIC_POP # if _LIBCPP_STD_VER >= 14 template diff --git a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp index 69d458a920558..489e3a6a73744 100644 --- a/libcxx/test/libcxx/ranges/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/ranges/no_specializations.verify.cpp @@ -13,7 +13,9 @@ #include -#if !__has_warning("-Winvalid-specialization") +#include "test_macros.h" + +#if !__has_warning("-Winvalid-specialization") || TEST_STD_VER <= 20 // expected-no-diagnostics #else struct S {}; diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index e6d960667e8c0..807d01e381b49 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -36,15 +36,22 @@ SPECIALIZE_TRAIT(make_unsigned); // expected-error {{cannot be specialize SPECIALIZE_TRAIT(remove_all_extents); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_const); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_cv); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_extent); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_pointer); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_reference); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(remove_volatile); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} SPECIALIZE_TRAIT(underlying_type); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} -SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} + +# if TEST_STD_VER <= 17 +SPECIALIZE_TRAIT(result_of); // expected-error {{cannot be specialized}} +# endif + +# if TEST_STD_VER >= 20 +SPECIALIZE_TRAIT(remove_cvref); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(type_identity); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_reference); // expected-error {{cannot be specialized}} +SPECIALIZE_TRAIT(unwrap_ref_decay); // expected-error {{cannot be specialized}} +# endif # undef SPECIALIZE_TRAIT # define SPECIALIZE_UTT(Trait) \ @@ -96,7 +103,6 @@ SPECIALIZE_UTT(is_move_assignable); // expected-error 2 {{cannot 
SPECIALIZE_UTT(is_move_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_BTT(is_nothrow_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_copy_constructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_nothrow_default_constructible); // expected-error 2 {{cannot be specialized}}
@@ -130,7 +136,6 @@ SPECIALIZE_UTT(is_trivially_default_constructible); // expected-error 2 {{cannot
 SPECIALIZE_UTT(is_trivially_destructible); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_assignable); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_trivially_move_constructible); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_union); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_unsigned); // expected-error 2 {{cannot be specialized}}
 SPECIALIZE_UTT(is_void); // expected-error 2 {{cannot be specialized}}
@@ -140,11 +145,12 @@ SPECIALIZE_UTT(rank); // expected-error 2 {{cannot

 #  if TEST_STD_VER <= 17
 SPECIALIZE_UTT(is_literal_type); // expected-error 2 {{cannot be specialized}}
-SPECIALIZE_UTT(result_of); // expected-error 2 {{cannot be specialized}}
 #  endif

 #  if TEST_STD_VER >= 20
-SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_bounded_array); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_BTT(is_nothrow_convertible); // expected-error 2 {{cannot be specialized}}
+SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be specialized}}
 #  endif

 #  if TEST_STD_VER >= 23
@@ -171,6 +177,8 @@ struct std::conditional; // expected-error {{cannot be specialized}}
 template <>
 struct std::enable_if; // expected-error {{cannot be specialized}}

+#if TEST_STD_VER >= 20
 template <>
 struct std::integral_constant; // expected-error {{cannot be specialized}}
 #endif
+#endif

From 7974f12b1e3682514bd58b35c5a784f35938fa04 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi
Date: Sat, 25 Jan 2025 18:22:14 +0900
Subject: [PATCH 085/432] [HLSL] Suppress a warning in #122820
 [-Wunused-but-set-variable]

--- clang/lib/Sema/SemaHLSL.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index a7033cb54886a..aa99b44958eaf 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -425,6 +425,7 @@ static CXXRecordDecl *createHostLayoutStruct(Sema &S,
   // copy base struct, create HLSL Buffer compatible version if needed
   if (unsigned NumBases = StructDecl->getNumBases()) {
     assert(NumBases == 1 && "HLSL supports only one base type");
+    (void)NumBases;
     CXXBaseSpecifier Base = *StructDecl->bases_begin();
     CXXRecordDecl *BaseDecl = Base.getType()->getAsCXXRecordDecl();
     if (requiresImplicitBufferLayoutStructure(BaseDecl)) {

From 2696e4fb9567d23ce065a067e7f4909b310daf50 Mon Sep 17 00:00:00 2001
From: Nikolas Klauser
Date: Sat, 25 Jan 2025 10:36:43 +0100
Subject: [PATCH 086/432] [libc++] Reduce std::conjunction overhead (#124259)

The old and new implementations of `_And` are very close in terms of
performance according to my testing, but the new implementation can also
be used to implement `conjunction`, which makes
that ~50% faster. --- libcxx/include/__type_traits/conjunction.h | 42 ++++++++++------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/libcxx/include/__type_traits/conjunction.h b/libcxx/include/__type_traits/conjunction.h index ad9656acd47ec..6b6717a50a468 100644 --- a/libcxx/include/__type_traits/conjunction.h +++ b/libcxx/include/__type_traits/conjunction.h @@ -10,8 +10,6 @@ #define _LIBCPP___TYPE_TRAITS_CONJUNCTION_H #include <__config> -#include <__type_traits/conditional.h> -#include <__type_traits/enable_if.h> #include <__type_traits/integral_constant.h> #include <__type_traits/is_same.h> @@ -21,22 +19,29 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template -using __expand_to_true _LIBCPP_NODEBUG = true_type; +template +struct _AndImpl; -template -__expand_to_true<__enable_if_t<_Pred::value>...> __and_helper(int); +template <> +struct _AndImpl { + template + using _Result _LIBCPP_NODEBUG = + typename _AndImpl::template _Result<_First, _Rest...>; +}; -template -false_type __and_helper(...); +template <> +struct _AndImpl { + template + using _Result _LIBCPP_NODEBUG = _Res; +}; // _And always performs lazy evaluation of its arguments. // // However, `_And<_Pred...>` itself will evaluate its result immediately (without having to // be instantiated) since it is an alias, unlike `conjunction<_Pred...>`, which is a struct. // If you want to defer the evaluation of `_And<_Pred...>` itself, use `_Lazy<_And, _Pred...>`. -template -using _And _LIBCPP_NODEBUG = decltype(std::__and_helper<_Pred...>(0)); +template +using _And _LIBCPP_NODEBUG = typename _AndImpl::template _Result; template struct __all_dummy; @@ -46,22 +51,11 @@ struct __all : _IsSame<__all_dummy<_Pred...>, __all_dummy<((void)_Pred, true)... #if _LIBCPP_STD_VER >= 17 -template -struct _LIBCPP_NO_SPECIALIZATIONS conjunction : true_type {}; - -_LIBCPP_DIAGNOSTIC_PUSH -# if __has_warning("-Winvalid-specialization") -_LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Winvalid-specialization") -# endif -template -struct conjunction<_Arg> : _Arg {}; - -template -struct conjunction<_Arg, _Args...> : conditional_t> {}; -_LIBCPP_DIAGNOSTIC_POP +template +struct _LIBCPP_NO_SPECIALIZATIONS conjunction : _And<_Args...> {}; template -_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool conjunction_v = conjunction<_Args...>::value; +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool conjunction_v = _And<_Args...>::value; #endif // _LIBCPP_STD_VER >= 17 From 52bffdf9f5bb72eb86249a012d08a40c90316dfb Mon Sep 17 00:00:00 2001 From: David Green Date: Sat, 25 Jan 2025 10:59:50 +0000 Subject: [PATCH 087/432] [IPSCCP][FuncSpec] Protect against metadata access from call args. (#124284) Fixes an issue reported from #114964, where metadata arguments were attempted to be accessed as constants. 
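
As a sketch of the fix (mirroring the FunctionSpecialization.cpp hunk below;
the explanatory comments are added here, not part of the patch):

    for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
      Value *V = I.getOperand(Idx);
      // Metadata operands, such as the !"fpexcept.strict" strings on
      // constrained FP intrinsics, have no Constant representation, so
      // bail out instead of asking the solver for a constant.
      if (isa<MetadataAsValue>(V))
        return nullptr;
      Constant *C = findConstantFor(V);
      if (!C)
        return nullptr;
      // ... cost estimation continues for constant operands ...
    }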
--- .../Transforms/IPO/FunctionSpecialization.cpp | 2 ++
 .../solver-constant-strictfpmetadata.ll | 17 +++++++++++++++++
 2 files changed, 19 insertions(+)
 create mode 100644 llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll

diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 449d64d1614ff..c13305ce5056d 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -415,6 +415,8 @@ Constant *InstCostVisitor::visitCallBase(CallBase &I) {

   for (unsigned Idx = 0, E = I.getNumOperands() - 1; Idx != E; ++Idx) {
     Value *V = I.getOperand(Idx);
+    if (isa<MetadataAsValue>(V))
+      return nullptr;
     Constant *C = findConstantFor(V);
     if (!C)
       return nullptr;
diff --git a/llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll b/llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll
new file mode 100644
index 0000000000000..99224b4efba6b
--- /dev/null
+++ b/llvm/test/Transforms/FunctionSpecialization/solver-constant-strictfpmetadata.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=ipsccp -force-specialization -S < %s | FileCheck %s
+
+define float @test(ptr %this, float %cm, i1 %0) strictfp {
+; CHECK-LABEL: define float @test(
+; CHECK-SAME: ptr [[THIS:%.*]], float [[CM:%.*]], i1 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[CMP:%.*]] = call i1 @llvm.experimental.constrained.fcmps.f32(float [[CM]], float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict")
+; CHECK-NEXT: [[CALL295:%.*]] = call float @test.specialized.1(ptr null, float 0.000000e+00, i1 false)
+; CHECK-NEXT: ret float 0.000000e+00
+;
+entry:
+  %cmp = call i1 @llvm.experimental.constrained.fcmps.f32(float %cm, float 0.000000e+00, metadata !"ole", metadata !"fpexcept.strict") #0
+  %call295 = call float @test(ptr null, float 0.000000e+00, i1 false) #0
+  ret float 0.000000e+00
+}
+

From 1a53d4baeb0242e00c494fd0a2b2ce58bcbf28b6 Mon Sep 17 00:00:00 2001
From: Tom Stellard
Date: Sat, 25 Jan 2025 03:59:45 -0800
Subject: [PATCH 088/432] [clang][cmake] Apply bolt optimizations as part of
 the clang target (#119896)

This change removes the need to call the clang-bolt target in order to
apply bolt optimizations to clang. Now running `ninja clang` will build
a clang with bolt optimizations, and `ninja check-clang` and `ninja
install-clang` will test and install bolt optimized clang too.

The clang-bolt target has been kept for compatibility reasons, but it is
now just an alias to the clang target.

Also, this new design for applying the bolt optimizations to clang will
be easier to generalize and use to optimize other binaries/libraries in
the project.

---------

Co-authored-by: Amir Ayupov
Co-authored-by: Petr Hosek
--- clang/CMakeLists.txt | 52 ------------
 clang/tools/driver/CMakeLists.txt | 52 +++++++++++++
 clang/utils/perf-training/CMakeLists.txt | 26 +------
 clang/utils/perf-training/perf-helper.py | 98 ++++++++++++++++++++++++
 4 files changed, 151 insertions(+), 77 deletions(-)

diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index b79e570667b2c..cacbf2ebf868f 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -872,58 +872,6 @@ if (CLANG_ENABLE_BOOTSTRAP)
   endforeach()
 endif()

-set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang.
\ - May be specified as Instrument or Perf or LBR to use a particular profiling \ - mechanism.") -string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - -if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) - set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) - set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) - - # Pass extra flag in no-LBR mode - if (CLANG_BOLT STREQUAL "PERF") - set(BOLT_NO_LBR "-nl") - endif() - - if (CLANG_BOLT STREQUAL "INSTRUMENT") - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${BOLT_FDATA} - COMMENT "Instrumenting clang binary with BOLT" - USES_TERMINAL - VERBATIM - ) - add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented) - else() # perf or LBR - add_custom_target(clang-bolt-training-deps DEPENDS clang) - endif() - - # Optimize original (pre-bolt) Clang using the collected profile - add_custom_target(clang-bolt - DEPENDS clang-bolt-profile - COMMAND ${CMAKE_COMMAND} -E rename $ ${CLANG_PATH}-prebolt - COMMAND ${CMAKE_COMMAND} -E create_symlink ${CLANG_PATH}-prebolt ${CLANG_PATH}++-prebolt - COMMAND llvm-bolt ${CLANG_PATH}-prebolt - -o $ - -data ${BOLT_FDATA} - -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions - -split-all-cold -split-eh -dyno-stats -use-gnu-stack - -update-debug-sections - ${BOLT_NO_LBR} - COMMENT "Optimizing Clang with BOLT" - USES_TERMINAL - VERBATIM - ) -endif() - if (LLVM_ADD_NATIVE_VISUALIZERS_TO_SOLUTION) add_subdirectory(utils/ClangVisualizers) endif() diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index a4debc2dd2e89..ad336fcc45b60 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -23,6 +23,18 @@ if(CLANG_PLUGIN_SUPPORT) set(support_plugins SUPPORT_PLUGINS) endif() +set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. \ + May be specified as Instrument or Perf or LBR to use a particular profiling \ + mechanism.") +string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) + +if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) + set(CLANG_BOLT_DEPS clear-bolt-fdata llvm-bolt llvm-readobj) + if (NOT CLANG_BOLT STREQUAL "INSTRUMENT") + list(APPEND CLANG_BOLT_DEPS clear-perf-data) + endif() +endif() + add_clang_tool(clang driver.cpp cc1_main.cpp @@ -35,6 +47,7 @@ add_clang_tool(clang ARMTargetParserTableGen AArch64TargetParserTableGen ${support_plugins} + ${CLANG_BOLT_DEPS} GENERATE_DRIVER ) @@ -134,3 +147,42 @@ if(CLANG_ORDER_FILE AND set_target_properties(clang PROPERTIES LINK_DEPENDS ${CLANG_ORDER_FILE}) endif() endif() + +if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) + # Add a clang-bolt target for backwards compatibility. 
+ add_custom_target(clang-bolt DEPENDS clang) + + set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING + "Name of BOLT-instrumented Clang binary") + set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) + set(PERF_TRAINING_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../utils/perf-training) + set(BOLT_FDATA ${PERF_TRAINING_BINARY_DIR}/prof.fdata) + get_llvm_lit_path( + lit_base_dir + lit_file_name + ALLOW_EXTERNAL + ) + set(LIT_COMMAND "${lit_base_dir}/${lit_file_name}") + + # This POST_BUILD command is executed unconditionally even if the clang target + # is already built. We need to wrap the whole bolt optimization process in + # a single python wrapper, so that we can first check if the binary has + # already been optimized and then exit early with a 0 status if it has. + add_custom_command( + TARGET clang POST_BUILD + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/../../utils/perf-training/perf-helper.py + bolt-optimize + --method ${CLANG_BOLT} + --input $ + --instrumented-output ${CLANG_INSTRUMENTED} + --fdata ${BOLT_FDATA} + --perf-training-binary-dir ${PERF_TRAINING_BINARY_DIR} + --readelf $ + --bolt $ + --lit "${LIT_COMMAND}" + --merge-fdata $ + COMMENT "Optimizing Clang with BOLT" + USES_TERMINAL + VERBATIM + ) +endif() diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 49673790ff6e8..4aed086563ee9 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -83,8 +83,6 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD) endif() if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) - set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING - "Name of BOLT-instrumented Clang binary") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg @@ -93,7 +91,7 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang" ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data + DEPENDS clear-bolt-fdata clear-perf-data ) add_custom_target(clear-bolt-fdata @@ -104,26 +102,4 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data COMMENT "Clearing old perf data") - string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - if (CLANG_BOLT STREQUAL "LBR") - set(BOLT_LBR "--lbr") - endif() - - add_custom_target(merge-fdata-deps) - if (CLANG_BOLT STREQUAL "INSTRUMENT") - add_dependencies(merge-fdata-deps generate-bolt-fdata) - else() - # Convert perf profiles into fdata - add_custom_target(convert-perf-fdata - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $ ${CMAKE_CURRENT_BINARY_DIR} $ ${BOLT_LBR} - COMMENT "Converting perf files to BOLT fdata" - DEPENDS llvm-bolt generate-bolt-fdata) - add_dependencies(merge-fdata-deps convert-perf-fdata) - endif() - - # Merge profiles into one using merge-fdata - add_custom_target(clang-bolt-profile - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $ ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR} - COMMENT "Merging BOLT fdata" - DEPENDS merge-fdata merge-fdata-deps) endif() diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index d76c6ede3fe5a..55c5160a71c4f 100644 --- 
a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -16,6 +16,8 @@ import bisect import shlex import tempfile +import re +import shutil test_env = {"PATH": os.environ["PATH"]} @@ -558,7 +560,103 @@ def genOrderFile(args): return 0 +def bolt_optimize(args): + parser = argparse.ArgumentParser("%prog [options] ") + parser.add_argument("--method", choices=["INSTRUMENT", "PERF", "LBR"]) + parser.add_argument("--input") + parser.add_argument("--instrumented-output") + parser.add_argument("--fdata") + parser.add_argument("--perf-training-binary-dir") + parser.add_argument("--readelf") + parser.add_argument("--bolt") + parser.add_argument("--lit") + parser.add_argument("--merge-fdata") + + opts = parser.parse_args(args) + + output = subprocess.check_output( + [opts.readelf, "-WS", opts.input], universal_newlines=True + ) + + # This binary has already been bolt-optimized, so skip further processing. + if re.search("\\.bolt\\.org\\.text", output, re.MULTILINE): + return 0 + + if opts.method == "INSTRUMENT": + process = subprocess.run( + [ + opts.bolt, + opts.input, + "-o", + opts.instrumented_output, + "-instrument", + "--instrumentation-file-append-pid", + f"--instrumentation-file={opts.fdata}", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + print(process.args) + for line in process.stdout: + sys.stdout.write(line) + process.check_returncode() + + process = subprocess.run( + [ + sys.executable, + opts.lit, + os.path.join(opts.perf_training_binary_dir, "bolt-fdata"), + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + print(process.args) + for line in process.stdout: + sys.stdout.write(line) + process.check_returncode() + + if opts.method in ["PERF", "LBR"]: + perf2bolt([opts.bolt, opts.perf_training_binary_dir, opts.input]) + + merge_fdata([opts.merge_fdata, opts.fdata, opts.perf_training_binary_dir]) + + shutil.copy(opts.input, f"{opts.input}-prebolt") + + process = subprocess.run( + [ + opts.bolt, + f"{opts.input}-prebolt", + "-o", + opts.input, + "-data", + opts.fdata, + "-reorder-blocks=ext-tsp", + "-reorder-functions=cdsort", + "-split-functions", + "-split-all-cold", + "-split-eh", + "-dyno-stats", + "-use-gnu-stack", + "-update-debug-sections", + "-nl" if opts.method == "PERF" else "", + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + + print(process.args) + for line in process.stdout: + sys.stdout.write(line) + process.check_returncode() + + commands = { + "bolt-optimize": bolt_optimize, "clean": clean, "merge": merge, "dtrace": dtrace, From de5ff8ad07ae824b86c5cefcba63f4b66607b759 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sat, 25 Jan 2025 13:08:00 +0100 Subject: [PATCH 089/432] [libc++][test] Improves C++ Standard filtering. (#89499) Adds a new lit directive to improve C++ Standard filtering. This is based on the [Discourse](https://discourse.llvm.org/t/rfc-improving-c-standard-filtering-in-the-lit-tests/78474) discussion. 
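
For example (mirroring the documentation added in this patch), a test that
needs C++26 or later can now state that directly:

    // UNSUPPORTED: std-at-least-c++26

instead of enumerating every released standard:

    // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23

and a test that is only valid for a bounded range of versions, such as a
deprecation test, spells the range out explicitly:

    // REQUIRES: c++17 || c++20 || c++23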
--- libcxx/docs/TestingLibcxx.rst | 32 +++++++++++++++++++ .../print.fun/includes.compile.pass.cpp | 2 +- .../print.fun/no_file_description.pass.cpp | 2 +- .../locale.stdcvt/depr.verify.cpp | 2 +- .../conversions.buffer/depr.verify.cpp | 2 +- .../conversions.string/depr.verify.cpp | 2 +- .../reserve.deprecated_in_cxx20.verify.cpp | 2 +- libcxx/utils/libcxx/test/params.py | 3 +- 8 files changed, 40 insertions(+), 7 deletions(-) diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index e98b96bfb478f..4da7f3e85d291 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -435,6 +435,38 @@ writing tests easier. See `libc++-specific Lit Directives`_ for more information extension.) +C++ Standard version tests +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Historically libc++ tests used to filter the tests for C++ Standard versions +with lit directives like: + +.. code-block:: cpp + + // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +With C++ Standards released every 3 years, this solution is not scalable. +Instead use: + +.. code-block:: cpp + + // UNSUPPORTED: std-at-least-c++26 + +There is no corresponding ``std-at-most-c++23``. This could be useful when +tests are only valid for a small set of standard versions. For example, a +deprecation test is only valid when the feature is deprecated until it is +removed from the Standard. These tests should be written like: + +.. code-block:: cpp + + // REQUIRES: c++17 || c++20 || c++23 + +.. note:: + + There are a lot of tests with the first style, these can remain as they are. + The new style is only intended to be used for new tests. + + Benchmarks ========== diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp index 9b9b0e404e6b7..38e4e4d3fb9ef 100644 --- a/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/print.fun/includes.compile.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// REQUIRES: std-at-least-c++23 // UNSUPPORTED: no-filesystem // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME diff --git a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp index d3e4463fe0bc8..5561a1a8b3334 100644 --- a/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp +++ b/libcxx/test/std/input.output/iostream.format/print.fun/no_file_description.pass.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// REQUIRES: std-at-least-c++23 // UNSUPPORTED: no-filesystem // UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME diff --git a/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp b/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp index b3c6fc8674f8a..7bdcaa5190bd0 100644 --- a/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp +++ b/libcxx/test/std/localization/locale.stdcvt/depr.verify.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// UNSUPPORTED: c++03, c++11, c++14, c++26 +// REQUIRES: c++17 || c++20 || c++23 // UNSUPPORTED: no-wide-characters // diff --git 
a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp
index cb067e99a4764..dcab5cef3a550 100644
--- a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp
+++ b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/depr.verify.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//

-// UNSUPPORTED: c++03, c++11, c++14, c++26
+// REQUIRES: c++17 || c++20 || c++23

 // XFAIL: no-wide-characters

diff --git a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp
index f8bd156bdd5f6..6eab4a5dd9223 100644
--- a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp
+++ b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.string/depr.verify.cpp
@@ -8,7 +8,7 @@

 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_ENABLE_CXX26_REMOVED_WSTRING_CONVERT

-// UNSUPPORTED: c++03, c++11, c++14, c++26
+// REQUIRES: c++17 || c++20 || c++23

 // UNSUPPORTED: no-wide-characters

 //
diff --git a/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp b/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp
index 81edd9b83d184..87b56c06b9512 100644
--- a/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp
+++ b/libcxx/test/std/strings/basic.string/string.capacity/reserve.deprecated_in_cxx20.verify.cpp
@@ -10,7 +10,7 @@

 // void reserve(); // Deprecated in C++20

-// UNSUPPORTED: c++03, c++11, c++14, c++17, c++26
+// REQUIRES: c++20 || c++23

 #include

diff --git a/libcxx/utils/libcxx/test/params.py b/libcxx/utils/libcxx/test/params.py
index 947cfd2651364..8fd3872cd8cbb 100644
--- a/libcxx/utils/libcxx/test/params.py
+++ b/libcxx/utils/libcxx/test/params.py
@@ -186,7 +186,8 @@ def getSuitableClangTidy(cfg):
                 AddFeature(std),
                 AddSubstitution("%{cxx_std}", re.sub(r"\+", "x", std)),
                 AddCompileFlag(lambda cfg: getStdFlag(cfg, std)),
-            ],
+            ]
+            + [AddFeature(f"std-at-least-{s}") for s in _allStandards if s <= std],
         ),
         Parameter(
             name="optimization",

From 6383a12e3b4339fa4743bb97da4d51dea6d2e2ea Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 25 Jan 2025 13:32:00 +0000
Subject: [PATCH 090/432] [VPlan] Refactor HCFG builder to preserve original
 vector latch (NFC).

Update HCFG builder to preserve the original latch block of the initial
VPlan, ensuring there is always a latch. It also skips creating the
BranchOnCond for the latch of the top-level loop, instead of removing it
later. Exiting via the latch is controlled by later recipes. This
further unifies HCFG construction and prepares it for also building an
initial VPlan (VPlan0) for inner loops.
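
Schematically (condensed from the updated vplan-printing-outer-loop.ll test
below; block names are as printed by the VPlan printer), the top-level region
now always ends in a dedicated latch block instead of branching out of the
body directly:

    outer.loop: {
      vector.body:
        ...                        ; widened recipes for the loop body
      ...
      outer.latch:
        EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1>
        EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8>
      Successor(s): vector.latch   ; no branch-on-cond created here
      vector.latch:                ; the preserved original latch
      No successors
    }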
--- .../Transforms/Vectorize/LoopVectorize.cpp | 6 ---
 llvm/lib/Transforms/Vectorize/VPlan.cpp | 3 ++
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp | 17 +++++--
 .../vplan-printing-outer-loop.ll | 8 +--
 .../Transforms/Vectorize/VPlanHCFGTest.cpp | 49 +++++++++++--------
 5 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 49694eb68e25b..3a4f637f177e1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9509,12 +9509,6 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
       *PSE.getSE(), *TLI);

-  // Remove the existing terminator of the exiting block of the top-most region.
-  // A BranchOnCount will be added instead when adding the canonical IV recipes.
-  auto *Term =
-      Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
-  Term->eraseFromParent();
-
   // Tail folding is not supported for outer loops, so the induction increment
   // is guaranteed to not wrap.
   bool HasNUW = true;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 4159a71469bd1..83c54a9b9c259 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1628,6 +1628,9 @@ void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
     VFRange SubRange = {VF, MaxVFTimes2};
     auto Plan = buildVPlan(SubRange);
     VPlanTransforms::optimize(*Plan);
+    // Update the name of the latch of the top-level vector loop region
+    // after optimizations, which include block folding.
+    Plan->getVectorLoopRegion()->getExiting()->setName("vector.latch");
     VPlans.push_back(std::move(Plan));
     VF = SubRange.End;
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 0f3aa8d08e7b8..32723e5db9c45 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -292,6 +292,11 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
            "Instruction shouldn't have been visited.");

     if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+      if (TheLoop->getLoopLatch() == BB ||
+          any_of(successors(BB),
+                 [this](BasicBlock *Succ) { return !TheLoop->contains(Succ); }))
+        continue;
+
       // Conditional branch instruction are represented using BranchOnCond
       // recipes.
       if (Br->isConditional()) {
@@ -356,11 +361,6 @@ void PlainCFGBuilder::buildPlainCFG() {
   VPBasicBlock *VectorLatchVPBB = TheRegion->getExitingBasicBlock();
   BB2VPBB[TheLoop->getHeader()] = VectorHeaderVPBB;
   VectorHeaderVPBB->clearSuccessors();
-  VectorLatchVPBB->clearPredecessors();
-  if (TheLoop->getHeader() != TheLoop->getLoopLatch())
-    BB2VPBB[TheLoop->getLoopLatch()] = VectorLatchVPBB;
-  else
-    TheRegion->setExiting(VectorHeaderVPBB);

   // 1. Scan the body of the loop in a topological order to visit each basic
   // block after having visited its predecessor basic blocks. Create a VPBB for
@@ -398,6 +398,13 @@ void PlainCFGBuilder::buildPlainCFG() {
       setRegionPredsFromBB(Region, BB);
     }

+    if (TheLoop->getLoopLatch() == BB) {
+      VPBB->setOneSuccessor(VectorLatchVPBB);
+      VectorLatchVPBB->clearPredecessors();
+      VectorLatchVPBB->setPredecessors({VPBB});
+      continue;
+    }
+
     // Set VPBB successors. We create empty VPBBs for successors if they don't
     // exist already.
Recipes will be created when the successor is visited // during the RPO traversal. diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll index 2adeb5920cb5b..52b2bcd9aac11 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-outer-loop.ll @@ -35,12 +35,14 @@ define void @foo(i64 %n) { ; CHECK-NEXT: EMIT branch-on-cond ir<%inner.ec> ; CHECK-NEXT: No successors ; CHECK-NEXT: } -; CHECK-NEXT: Successor(s): vector.latch +; CHECK-NEXT: Successor(s): outer.latch ; CHECK-EMPTY: -; CHECK-NEXT: vector.latch: +; CHECK-NEXT: outer.latch: ; CHECK-NEXT: EMIT ir<%outer.iv.next> = add ir<%outer.iv>, ir<1> ; CHECK-NEXT: EMIT ir<%outer.ec> = icmp ir<%outer.iv.next>, ir<8> -; CHECK-NEXT: EMIT branch-on-cond ir<%outer.ec> +; CHECK-NEXT: Successor(s): vector.latch +; CHECK-EMPTY: +; CHECK-NEXT: vector.latch: ; CHECK-NEXT: No successors ; CHECK-NEXT: } ; CHECK-NEXT: Successor(s): middle.block diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 19c2483d34ed1..dcdaf008e10fe 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -48,16 +48,19 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { EXPECT_EQ(0u, Entry->getNumPredecessors()); EXPECT_EQ(1u, Entry->getNumSuccessors()); - // Check that the region following the preheader is a single basic-block - // region (loop). + // Check that the region following the preheader consists of a block for the + // original header and a separate latch. VPBasicBlock *VecBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); - EXPECT_EQ(8u, VecBB->size()); + EXPECT_EQ(7u, VecBB->size()); EXPECT_EQ(0u, VecBB->getNumPredecessors()); - EXPECT_EQ(0u, VecBB->getNumSuccessors()); + EXPECT_EQ(1u, VecBB->getNumSuccessors()); EXPECT_EQ(VecBB->getParent()->getEntryBasicBlock(), VecBB); - EXPECT_EQ(VecBB->getParent()->getExitingBasicBlock(), VecBB); EXPECT_EQ(&*Plan, VecBB->getPlan()); + VPBlockBase *VecLatch = VecBB->getSingleSuccessor(); + EXPECT_EQ(VecLatch->getParent()->getExitingBasicBlock(), VecLatch); + EXPECT_EQ(0u, VecLatch->getNumSuccessors()); + auto Iter = VecBB->begin(); VPWidenPHIRecipe *Phi = dyn_cast(&*Iter++); EXPECT_NE(nullptr, Phi); @@ -127,29 +130,33 @@ compound=true " EMIT store ir\<%res\>, ir\<%arr.idx\>\l" + " EMIT ir\<%indvars.iv.next\> = add ir\<%indvars.iv\>, ir\<1\>\l" + " EMIT ir\<%exitcond\> = icmp ir\<%indvars.iv.next\>, ir\<%N\>\l" + - " EMIT branch-on-cond ir\<%exitcond\>\l" + + "Successor(s): vector.latch\l" + ] + N2 -> N4 [ label=""] + N4 [label = + "vector.latch:\l" + "No successors\l" ] } - N2 -> N4 [ label="" ltail=cluster_N3] - N4 [label = + N4 -> N5 [ label="" ltail=cluster_N3] + N5 [label = "middle.block:\l" + " EMIT vp\<%cmp.n\> = icmp eq ir\<%N\>, vp\<%0\>\l" + " EMIT branch-on-cond vp\<%cmp.n\>\l" + "Successor(s): ir-bb\, scalar.ph\l" ] - N4 -> N5 [ label="T"] - N4 -> N6 [ label="F"] - N5 [label = + N5 -> N6 [ label="T"] + N5 -> N7 [ label="F"] + N6 [label = "ir-bb\:\l" + "No successors\l" ] - N6 [label = + N7 [label = "scalar.ph:\l" + "Successor(s): ir-bb\\l" ] - N6 -> N7 [ label=""] - N7 [label = + N7 -> N8 [ label=""] + N8 [label = "ir-bb\:\l" + " IR %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\l" + " IR %arr.idx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv\l" + @@ -204,14 +211,17 @@ 
TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
   EXPECT_EQ(0u, Entry->getNumPredecessors());
   EXPECT_EQ(1u, Entry->getNumSuccessors());

-  // Check that the region following the preheader is a single basic-block
-  // region (loop).
+  // Check that the region following the preheader consists of a block for the
+  // original header and a separate latch.
   VPBasicBlock *VecBB = Plan->getVectorLoopRegion()->getEntryBasicBlock();
-  EXPECT_EQ(8u, VecBB->size());
+  EXPECT_EQ(7u, VecBB->size());
   EXPECT_EQ(0u, VecBB->getNumPredecessors());
-  EXPECT_EQ(0u, VecBB->getNumSuccessors());
+  EXPECT_EQ(1u, VecBB->getNumSuccessors());
   EXPECT_EQ(VecBB->getParent()->getEntryBasicBlock(), VecBB);
-  EXPECT_EQ(VecBB->getParent()->getExitingBasicBlock(), VecBB);
+
+  VPBlockBase *VecLatch = VecBB->getSingleSuccessor();
+  EXPECT_EQ(VecLatch->getParent()->getExitingBasicBlock(), VecLatch);
+  EXPECT_EQ(0u, VecLatch->getNumSuccessors());

   auto Iter = VecBB->begin();
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
@@ -221,7 +231,6 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) {
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_NE(nullptr, dyn_cast(&*Iter++));
-  EXPECT_NE(nullptr, dyn_cast(&*Iter++));
   EXPECT_EQ(VecBB->end(), Iter);
 }

From 9325a61aa0960595c22867799ebd157c8160fd86 Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Sat, 25 Jan 2025 10:16:37 -0500
Subject: [PATCH 091/432] Revert "[GlobalMerge][NFC] Skip sorting by
 profitability when it is not needed" (#124411)

Reverts llvm/llvm-project#124146 -- the new comparator is not a strict
weak ordering, as required by stable_sort.

Co-authored-by: Michael Maitland
--- llvm/lib/CodeGen/GlobalMerge.cpp | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index 41e01a1d3ccd5..7b76155b175d1 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -423,12 +423,24 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
     }
   }

+  // Now we found a bunch of sets of globals used together. We accumulated
+  // the number of times we encountered the sets (i.e., the number of functions
+  // that use that exact set of globals).
+  //
+  // Multiply that by the size of the set to give us a crude profitability
+  // metric.
+  llvm::stable_sort(UsedGlobalSets,
+                    [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
+                      return UGS1.Globals.count() * UGS1.UsageCount <
+                             UGS2.Globals.count() * UGS2.UsageCount;
+                    });
+
   // We can choose to merge all globals together, but ignore globals never used
   // with another global. This catches the obviously non-profitable cases of
   // having a single global, but is aggressive enough for any other case.
   if (GlobalMergeIgnoreSingleUse) {
     BitVector AllGlobals(Globals.size());
-    for (const UsedGlobalSet &UGS : UsedGlobalSets) {
+    for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) {
       if (UGS.UsageCount == 0)
         continue;
       if (UGS.Globals.count() > 1)
@@ -437,16 +449,6 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
     return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
   }

-  // Now we found a bunch of sets of globals used together. We accumulated
-  // the number of times we encountered the sets (i.e., the number of functions
-  // that use that exact set of globals). Multiply that by the size of the set
-  // to give us a crude profitability metric.
- llvm::stable_sort(UsedGlobalSets,
- [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
- return UGS1.Globals.count() * UGS1.UsageCount >=
- UGS2.Globals.count() * UGS2.UsageCount;
- });
-
 // Starting from the sets with the best (=biggest) profitability, find a
 // good combination.
 // The ideal (and expensive) solution can only be found by trying all
@@ -456,7 +458,7 @@ bool GlobalMergeImpl::doMerge(SmallVectorImpl<GlobalVariable *> &Globals,
 BitVector PickedGlobals(Globals.size());
 bool Changed = false;
- for (const UsedGlobalSet &UGS : UsedGlobalSets) {
+ for (const UsedGlobalSet &UGS : llvm::reverse(UsedGlobalSets)) {
 if (UGS.UsageCount == 0)
 continue;
 if (PickedGlobals.anyCommon(UGS.Globals))

From 21f04b1458c52ba875a23b58b02cf6b1f8db0661 Mon Sep 17 00:00:00 2001
From: Adam Paszke
Date: Sat, 25 Jan 2025 16:28:21 +0100
Subject: [PATCH 092/432] Hold a queue of iterator ranges (not operations) in
 wouldOpBeTriviallyDead (#123642)

Ranges let us push whole blocks onto the queue in constant time. If one
of the first ops in the block is side-effecting we'll be able to provide
the answer quickly. The previous implementation had to walk the block and
queue all the operations only to start traversing them again, which was a
considerable slowdown for compile times of large MLIR programs in our
benchmarks.

---------

Co-authored-by: Jacques Pienaar
---
 mlir/lib/Interfaces/SideEffectInterfaces.cpp | 23 +++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Interfaces/SideEffectInterfaces.cpp b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
index c9feb001a1984..59fd19310cea5 100644
--- a/mlir/lib/Interfaces/SideEffectInterfaces.cpp
+++ b/mlir/lib/Interfaces/SideEffectInterfaces.cpp
@@ -10,6 +10,7 @@
 #include "mlir/IR/SymbolTable.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include <utility>

 using namespace mlir;

@@ -41,10 +42,18 @@ bool mlir::isOpTriviallyDead(Operation *op) {
 /// allows for marking region operations as trivially dead without always being
 /// conservative of terminators.
 static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
- // The set of operations to consider when checking for side effects.
- SmallVector<Operation *, 1> effectingOps(1, rootOp);
+ // The set of operation intervals (end-exclusive) to consider when checking
+ // for side effects.
+ SmallVector<std::pair<Block::iterator, Block::iterator>, 1> effectingOps = {
+ std::make_pair(Block::iterator(rootOp), ++Block::iterator(rootOp))};
 while (!effectingOps.empty()) {
- Operation *op = effectingOps.pop_back_val();
+ Block::iterator &it = effectingOps.back().first;
+ Block::iterator end = effectingOps.back().second;
+ if (it == end) {
+ effectingOps.pop_back();
+ continue;
+ }
+ mlir::Operation *op = &*(it++);

 // If the operation has recursive effects, push all of the nested operations
 // on to the stack to consider.
@@ -53,8 +62,7 @@ static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
 if (hasRecursiveEffects) {
 for (Region &region : op->getRegions()) {
 for (auto &block : region) {
- for (auto &nestedOp : block)
- effectingOps.push_back(&nestedOp);
+ effectingOps.push_back(std::make_pair(block.begin(), block.end()));
 }
 }
 }
@@ -86,10 +94,9 @@ static bool wouldOpBeTriviallyDeadImpl(Operation *rootOp) {
 return false;
 }
 continue;
-
- // Otherwise, if the op has recursive side effects we can treat the
- // operation itself as having no effects.
 }
+ // Otherwise, if the op only has recursive side effects we can treat the
+ // operation itself as having no effects. We will visit its children next.
if (hasRecursiveEffects) continue; From 5cb2db3b51c2a9d516d57bd2f07d9899bd5fdae7 Mon Sep 17 00:00:00 2001 From: vporpo Date: Sat, 25 Jan 2025 08:19:27 -0800 Subject: [PATCH 093/432] [SandboxVec][Scheduler] Forbid crossing BBs (#124369) This patch updates the scheduler to forbid scheduling across BBs. It should eventually be able to handle this, but we disable it for now. --- .../Vectorize/SandboxVectorizer/Scheduler.h | 6 ++- .../Vectorize/SandboxVectorizer/Scheduler.cpp | 8 ++- .../SandboxVectorizer/SchedulerTest.cpp | 52 +++++++++++++++++++ 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h index 25432e1396c73..0da1894c90613 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h @@ -122,6 +122,8 @@ class Scheduler { std::optional ScheduleTopItOpt; // TODO: This is wasting memory in exchange for fast removal using a raw ptr. DenseMap> Bndls; + /// The BB that we are currently scheduling. + BasicBlock *ScheduledBB = nullptr; /// \Returns a scheduling bundle containing \p Instrs. SchedBundle *createBundle(ArrayRef Instrs); @@ -166,8 +168,10 @@ class Scheduler { DAG.clear(); ReadyList.clear(); ScheduleTopItOpt = std::nullopt; + ScheduledBB = nullptr; assert(Bndls.empty() && DAG.empty() && ReadyList.empty() && - !ScheduleTopItOpt && "Expected empty state!"); + !ScheduleTopItOpt && ScheduledBB == nullptr && + "Expected empty state!"); } #ifndef NDEBUG diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp index 496521b95a98e..06c1ef6b6d5ae 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp @@ -189,7 +189,13 @@ bool Scheduler::trySchedule(ArrayRef Instrs) { [Instrs](Instruction *I) { return I->getParent() == (*Instrs.begin())->getParent(); }) && - "Instrs not in the same BB!"); + "Instrs not in the same BB, should have been rejected by Legality!"); + if (ScheduledBB == nullptr) + ScheduledBB = Instrs[0]->getParent(); + // We don't support crossing BBs for now. 
+ if (any_of(Instrs,
+ [this](Instruction *I) { return I->getParent() != ScheduledBB; }))
+ return false;
 auto SchedState = getBndlSchedState(Instrs);
 switch (SchedState) {
 case BndlSchedState::FullyScheduled:
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
index c5e44a97976a7..5a2b92ed24b03 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp
@@ -51,6 +51,14 @@ struct SchedulerTest : public testing::Test {
 }
 };

+static sandboxir::BasicBlock *getBasicBlockByName(sandboxir::Function *F,
+ StringRef Name) {
+ for (sandboxir::BasicBlock &BB : *F)
+ if (BB.getName() == Name)
+ return &BB;
+ llvm_unreachable("Expected to find basic block!");
+}
+
 TEST_F(SchedulerTest, SchedBundle) {
 parseIR(C, R"IR(
 define void @foo(ptr %ptr, i8 %v0, i8 %v1) {
@@ -237,3 +245,47 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) {
 EXPECT_TRUE(Sched.trySchedule({Add0, Add1}));
 EXPECT_TRUE(Sched.trySchedule({L0, L1}));
 }
+
+TEST_F(SchedulerTest, DontCrossBBs) {
+ parseIR(C, R"IR(
+define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v0, i8 %v1) {
+bb0:
+ %add0 = add i8 %v0, 0
+ %add1 = add i8 %v1, 1
+ br label %bb1
+bb1:
+ store i8 %add0, ptr %ptr0
+ store i8 %add1, ptr %ptr1
+ ret void
+}
+)IR");
+ llvm::Function *LLVMF = &*M->getFunction("foo");
+ sandboxir::Context Ctx(C);
+ auto *F = Ctx.createFunction(LLVMF);
+ auto *BB0 = getBasicBlockByName(F, "bb0");
+ auto *BB1 = getBasicBlockByName(F, "bb1");
+ auto It = BB0->begin();
+ auto *Add0 = &*It++;
+ auto *Add1 = &*It++;
+
+ It = BB1->begin();
+ auto *S0 = cast<sandboxir::StoreInst>(&*It++);
+ auto *S1 = cast<sandboxir::StoreInst>(&*It++);
+ auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
+
+ {
+ // Schedule bottom-up
+ sandboxir::Scheduler Sched(getAA(*LLVMF), Ctx);
+ EXPECT_TRUE(Sched.trySchedule({Ret}));
+ EXPECT_TRUE(Sched.trySchedule({S0, S1}));
+ // Scheduling across blocks should fail.
+ EXPECT_FALSE(Sched.trySchedule({Add0, Add1}));
+ }
+ {
+ // Schedule top-down
+ sandboxir::Scheduler Sched(getAA(*LLVMF), Ctx);
+ EXPECT_TRUE(Sched.trySchedule({Add0, Add1}));
+ // Scheduling across blocks should fail.
+ EXPECT_FALSE(Sched.trySchedule({S0, S1}));
+ }
+}

From 485b1ac8a265dcf19c55a98aeefff95158cc63a2 Mon Sep 17 00:00:00 2001
From: Vasileios Porpodas
Date: Fri, 24 Jan 2025 14:47:09 -0800
Subject: [PATCH 094/432] [SandboxIR][Docs] C++ highlighting for code block

---
 llvm/docs/SandboxIR.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/docs/SandboxIR.md b/llvm/docs/SandboxIR.md
index 61bae4e36ef43..735190e19966e 100644
--- a/llvm/docs/SandboxIR.md
+++ b/llvm/docs/SandboxIR.md
@@ -6,7 +6,7 @@ Sandbox IR is an IR layer on top of LLVM IR that allows you to save/restore its
 Within your LLVM pass:

-```
+``` C++
 // 1. Include the necessary Sandbox IR header files.
 #include "llvm/SandboxIR/Context.h
 #include "llvm/SandboxIR/Function.h

From 14b44179cb61dd551c911dea54de57b588621005 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Sat, 25 Jan 2025 17:43:16 +0100
Subject: [PATCH 095/432] [libc++][format][3/3] Improves formatting
 performance. (#108990)

This changes the __output_buffer to a new structure. This improves the
performance of std::format, std::format_to, std::format_to_n, and
std::formatted_size.
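To make the redesigned buffer below easier to follow, here is a minimal,
self-contained sketch of the size-limiting idea the patch introduces (the
real helper is __max_output_size in __format/buffer.h). All names in the
sketch are illustrative stand-ins rather than libc++ internals, and it is
only a sketch of the technique, not the library's implementation:

``` C++
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <string_view>

// Illustrative stand-in for a write-limiting helper: it clamps how many
// code units are actually stored while still counting every request.
class max_output_size {
public:
  explicit max_output_size(std::size_t max) : max_size_(max) {}

  // Clamp a bulk write request; the full request is counted regardless.
  std::size_t write_request(std::size_t n) {
    std::size_t allowed =
        written_ < max_size_ ? std::min(n, max_size_ - written_) : 0;
    written_ += n; // counted even when the write is (partially) dropped
    return allowed;
  }

  std::size_t code_units_written() const { return written_; }

private:
  std::size_t max_size_;
  std::size_t written_ = 0;
};

int main() {
  char buf[8] = {};
  std::size_t pos = 0;
  max_output_size limit(sizeof(buf));

  for (std::string_view chunk : {"Hello, ", "world!", " (truncated)"}) {
    std::size_t n = limit.write_request(chunk.size());
    std::copy_n(chunk.data(), n, buf + pos);
    pos += n;
  }
  // Prints "stored 8, requested 25", mirroring format_to_n_result.
  std::printf("stored %zu, requested %zu\n", pos, limit.code_units_written());
}
```

The design point the sketch mirrors is that every request is counted even
when the write is clamped; that is what lets format_to_n report the size
the untruncated output would have had, and what lets formatted_size run
the formatting machinery with a limit of zero without storing anything.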
--- libcxx/include/__format/buffer.h | 623 ++++++++++-------- libcxx/include/__format/format_functions.h | 29 +- libcxx/include/module.modulemap | 5 +- .../format/format.functions/format_tests.h | 2 +- 4 files changed, 358 insertions(+), 301 deletions(-) diff --git a/libcxx/include/__format/buffer.h b/libcxx/include/__format/buffer.h index 9509f19e16724..0c054bbc3a1d8 100644 --- a/libcxx/include/__format/buffer.h +++ b/libcxx/include/__format/buffer.h @@ -14,6 +14,7 @@ #include <__algorithm/fill_n.h> #include <__algorithm/max.h> #include <__algorithm/min.h> +#include <__algorithm/ranges_copy.h> #include <__algorithm/ranges_copy_n.h> #include <__algorithm/transform.h> #include <__algorithm/unwrap_iter.h> @@ -29,6 +30,7 @@ #include <__iterator/wrap_iter.h> #include <__memory/addressof.h> #include <__memory/allocate_at_least.h> +#include <__memory/allocator.h> #include <__memory/allocator_traits.h> #include <__memory/construct_at.h> #include <__memory/ranges_construct_at.h> @@ -37,6 +39,7 @@ #include <__type_traits/conditional.h> #include <__utility/exception_guard.h> #include <__utility/move.h> +#include #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -52,24 +55,147 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace __format { +// A helper to limit the total size of code units written. +class _LIBCPP_HIDE_FROM_ABI __max_output_size { +public: + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __max_output_size(size_t __max_size) : __max_size_{__max_size} {} + + // This function adjusts the size of a (bulk) write operations. It ensures the + // number of code units written by a __output_buffer never exceeds + // __max_size_ code units. + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __write_request(size_t __code_units) { + size_t __result = + __code_units_written_ < __max_size_ ? std::min(__code_units, __max_size_ - __code_units_written_) : 0; + __code_units_written_ += __code_units; + return __result; + } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __code_units_written() const noexcept { return __code_units_written_; } + +private: + size_t __max_size_; + // The code units that would have been written if there was no limit. + // format_to_n returns this value. + size_t __code_units_written_{0}; +}; + /// A "buffer" that handles writing to the proper iterator. /// /// This helper is used together with the @ref back_insert_iterator to offer /// type-erasure for the formatting functions. This reduces the number to /// template instantiations. +/// +/// The design is the following: +/// - There is an external object that connects the buffer to the output. +/// - This buffer object: +/// - inherits publicly from this class. +/// - has a static or dynamic buffer. +/// - has a static member function to make space in its buffer write +/// operations. This can be done by increasing the size of the internal +/// buffer or by writing the contents of the buffer to the output iterator. +/// +/// This member function is a constructor argument, so its name is not +/// fixed. The code uses the name __prepare_write. +/// - The number of output code units can be limited by a __max_output_size +/// object. This is used in format_to_n This object: +/// - Contains the maximum number of code units to be written. +/// - Contains the number of code units that are requested to be written. +/// This number is returned to the user of format_to_n. +/// - The write functions call the object's __request_write member function. +/// This function: +/// - Updates the number of code units that are requested to be written. 
+/// - Returns the number of code units that can be written without +/// exceeding the maximum number of code units to be written. +/// +/// Documentation for the buffer usage members: +/// - __ptr_ +/// The start of the buffer. +/// - __capacity_ +/// The number of code units that can be written. This means +/// [__ptr_, __ptr_ + __capacity_) is a valid range to write to. +/// - __size_ +/// The number of code units written in the buffer. The next code unit will +/// be written at __ptr_ + __size_. This __size_ may NOT contain the total +/// number of code units written by the __output_buffer. Whether or not it +/// does depends on the sub-class used. Typically the total number of code +/// units written is not interesting. It is interesting for format_to_n which +/// has its own way to track this number. +/// +/// Documentation for the modifying buffer operations: +/// The subclasses have a function with the following signature: +/// +/// static void __prepare_write( +/// __output_buffer<_CharT>& __buffer, size_t __code_units); +/// +/// This function is called when a write function writes more code units than +/// the buffer's available space. When an __max_output_size object is provided +/// the number of code units is the number of code units returned from +/// __max_output_size::__request_write function. +/// +/// - The __buffer contains *this. Since the class containing this function +/// inherits from __output_buffer it's safe to cast it to the subclass being +/// used. +/// - The __code_units is the number of code units the caller will write + 1. +/// - This value does not take the available space of the buffer into account. +/// - The push_back function is more efficient when writing before resizing, +/// this means the buffer should always have room for one code unit. Hence +/// the + 1 is the size. +/// - When the function returns there is room for at least one additional code +/// unit. There is no requirement there is room for __code_units code units: +/// - The class has some "bulk" operations. For example, __copy which copies +/// the contents of a basic_string_view to the output. If the sub-class has +/// a fixed size buffer the size of the basic_string_view may be larger +/// than the buffer. In that case it's impossible to honor the requested +/// size. +/// - When the buffer has room for at least one code unit the function may be +/// a no-op. +/// - When the function makes space for more code units it uses one for these +/// functions to signal the change: +/// - __buffer_flushed() +/// - This function is typically used for a fixed sized buffer. +/// - The current contents of [__ptr_, __ptr_ + __size_) have been +/// processed. +/// - __ptr_ remains unchanged. +/// - __capacity_ remains unchanged. +/// - __size_ will be set to 0. +/// - __buffer_moved(_CharT* __ptr, size_t __capacity) +/// - This function is typically used for a dynamic sized buffer. There the +/// location of the buffer changes due to reallocations. +/// - __ptr_ will be set to __ptr. (This value may be the old value of +/// __ptr_). +/// - __capacity_ will be set to __capacity. (This value may be the old +/// value of __capacity_). +/// - __size_ remains unchanged, +/// - The range [__ptr, __ptr + __size_) contains the original data of the +/// range [__ptr_, __ptr_ + __size_). +/// +/// The push_back function expects a valid buffer and a capacity of at least 1. 
+/// This means: +/// - The class is constructed with a valid buffer, +/// - __buffer_moved is called with a valid buffer is used before the first +/// write operation, +/// - no write function is ever called, or +/// - the class is constructed with a __max_output_size object with __max_size 0. +/// +/// The latter option allows formatted_size to use the output buffer without +/// ever writing anything to the buffer. template <__fmt_char_type _CharT> class _LIBCPP_TEMPLATE_VIS __output_buffer { public: - using value_type = _CharT; + using value_type _LIBCPP_NODEBUG = _CharT; + using __prepare_write_type _LIBCPP_NODEBUG = void (*)(__output_buffer<_CharT>&, size_t); - template - _LIBCPP_HIDE_FROM_ABI explicit __output_buffer(_CharT* __ptr, size_t __capacity, _Tp* __obj) - : __ptr_(__ptr), - __capacity_(__capacity), - __flush_([](_CharT* __p, size_t __n, void* __o) { static_cast<_Tp*>(__o)->__flush(__p, __n); }), - __obj_(__obj) {} + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __output_buffer(_CharT* __ptr, size_t __capacity, __prepare_write_type __function) + : __output_buffer{__ptr, __capacity, __function, nullptr} {} - _LIBCPP_HIDE_FROM_ABI void __reset(_CharT* __ptr, size_t __capacity) { + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __output_buffer( + _CharT* __ptr, size_t __capacity, __prepare_write_type __function, __max_output_size* __max_output_size) + : __ptr_(__ptr), __capacity_(__capacity), __prepare_write_(__function), __max_output_size_(__max_output_size) {} + + _LIBCPP_HIDE_FROM_ABI void __buffer_flushed() { __size_ = 0; } + + _LIBCPP_HIDE_FROM_ABI void __buffer_moved(_CharT* __ptr, size_t __capacity) { __ptr_ = __ptr; __capacity_ = __capacity; } @@ -78,12 +204,18 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer { // Used in std::back_insert_iterator. _LIBCPP_HIDE_FROM_ABI void push_back(_CharT __c) { + if (__max_output_size_ && __max_output_size_->__write_request(1) == 0) + return; + + _LIBCPP_ASSERT_INTERNAL( + __ptr_ && __size_ < __capacity_ && __available() >= 1, "attempted to write outside the buffer"); + __ptr_[__size_++] = __c; // Profiling showed flushing after adding is more efficient than flushing // when entering the function. if (__size_ == __capacity_) - __flush(); + __prepare_write(0); } /// Copies the input __str to the buffer. @@ -104,25 +236,20 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer { // upper case. For integral these strings are short. // TODO FMT Look at the improvements above. size_t __n = __str.size(); - - __flush_on_overflow(__n); - if (__n < __capacity_) { // push_back requires the buffer to have room for at least one character (so use <). - std::copy_n(__str.data(), __n, std::addressof(__ptr_[__size_])); - __size_ += __n; - return; + if (__max_output_size_) { + __n = __max_output_size_->__write_request(__n); + if (__n == 0) + return; } - // The output doesn't fit in the internal buffer. - // Copy the data in "__capacity_" sized chunks. 
- _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow"); const _InCharT* __first = __str.data(); do { - size_t __chunk = std::min(__n, __capacity_); + __prepare_write(__n); + size_t __chunk = std::min(__n, __available()); std::copy_n(__first, __chunk, std::addressof(__ptr_[__size_])); - __size_ = __chunk; + __size_ += __chunk; __first += __chunk; __n -= __chunk; - __flush(); } while (__n); } @@ -136,121 +263,59 @@ class _LIBCPP_TEMPLATE_VIS __output_buffer { _LIBCPP_ASSERT_INTERNAL(__first <= __last, "not a valid range"); size_t __n = static_cast(__last - __first); - __flush_on_overflow(__n); - if (__n < __capacity_) { // push_back requires the buffer to have room for at least one character (so use <). - std::transform(__first, __last, std::addressof(__ptr_[__size_]), std::move(__operation)); - __size_ += __n; - return; + if (__max_output_size_) { + __n = __max_output_size_->__write_request(__n); + if (__n == 0) + return; } - // The output doesn't fit in the internal buffer. - // Transform the data in "__capacity_" sized chunks. - _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow"); do { - size_t __chunk = std::min(__n, __capacity_); + __prepare_write(__n); + size_t __chunk = std::min(__n, __available()); std::transform(__first, __first + __chunk, std::addressof(__ptr_[__size_]), __operation); - __size_ = __chunk; + __size_ += __chunk; __first += __chunk; __n -= __chunk; - __flush(); } while (__n); } /// A \c fill_n wrapper. _LIBCPP_HIDE_FROM_ABI void __fill(size_t __n, _CharT __value) { - __flush_on_overflow(__n); - if (__n < __capacity_) { // push_back requires the buffer to have room for at least one character (so use <). - std::fill_n(std::addressof(__ptr_[__size_]), __n, __value); - __size_ += __n; - return; + if (__max_output_size_) { + __n = __max_output_size_->__write_request(__n); + if (__n == 0) + return; } - // The output doesn't fit in the internal buffer. - // Fill the buffer in "__capacity_" sized chunks. - _LIBCPP_ASSERT_INTERNAL(__size_ == 0, "the buffer should be flushed by __flush_on_overflow"); do { - size_t __chunk = std::min(__n, __capacity_); + __prepare_write(__n); + size_t __chunk = std::min(__n, __available()); std::fill_n(std::addressof(__ptr_[__size_]), __chunk, __value); - __size_ = __chunk; + __size_ += __chunk; __n -= __chunk; - __flush(); } while (__n); } - _LIBCPP_HIDE_FROM_ABI void __flush() { - __flush_(__ptr_, __size_, __obj_); - __size_ = 0; - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __capacity() const { return __capacity_; } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __size() const { return __size_; } private: _CharT* __ptr_; size_t __capacity_; size_t __size_{0}; - void (*__flush_)(_CharT*, size_t, void*); - void* __obj_; - - /// Flushes the buffer when the output operation would overflow the buffer. - /// - /// A simple approach for the overflow detection would be something along the - /// lines: - /// \code - /// // The internal buffer is large enough. - /// if (__n <= __capacity_) { - /// // Flush when we really would overflow. - /// if (__size_ + __n >= __capacity_) - /// __flush(); - /// ... - /// } - /// \endcode - /// - /// This approach works for all cases but one: - /// A __format_to_n_buffer_base where \ref __enable_direct_output is true. - /// In that case the \ref __capacity_ of the buffer changes during the first - /// \ref __flush. During that operation the output buffer switches from its - /// __writer_ to its __storage_. 
The \ref __capacity_ of the former depends - /// on the value of n, of the latter is a fixed size. For example: - /// - a format_to_n call with a 10'000 char buffer, - /// - the buffer is filled with 9'500 chars, - /// - adding 1'000 elements would overflow the buffer so the buffer gets - /// changed and the \ref __capacity_ decreases from 10'000 to - /// __buffer_size (256 at the time of writing). - /// - /// This means that the \ref __flush for this class may need to copy a part of - /// the internal buffer to the proper output. In this example there will be - /// 500 characters that need this copy operation. - /// - /// Note it would be more efficient to write 500 chars directly and then swap - /// the buffers. This would make the code more complex and \ref format_to_n is - /// not the most common use case. Therefore the optimization isn't done. - _LIBCPP_HIDE_FROM_ABI void __flush_on_overflow(size_t __n) { - if (__size_ + __n >= __capacity_) - __flush(); - } -}; - -/// A storage using an internal buffer. -/// -/// This storage is used when writing a single element to the output iterator -/// is expensive. -template <__fmt_char_type _CharT> -class _LIBCPP_TEMPLATE_VIS __internal_storage { -public: - _LIBCPP_HIDE_FROM_ABI _CharT* __begin() { return __buffer_; } + void (*__prepare_write_)(__output_buffer<_CharT>&, size_t); + __max_output_size* __max_output_size_; - static constexpr size_t __buffer_size = 256 / sizeof(_CharT); + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __available() const { return __capacity_ - __size_; } -private: - _CharT __buffer_[__buffer_size]; + _LIBCPP_HIDE_FROM_ABI void __prepare_write(size_t __code_units) { + // Always have space for one additional code unit. This is a precondition of the push_back function. + __code_units += 1; + if (__available() < __code_units) + __prepare_write_(*this, __code_units + 1); + } }; -/// A storage writing directly to the storage. -/// -/// This requires the storage to be a contiguous buffer of \a _CharT. -/// Since the output is directly written to the underlying storage this class -/// is just an empty class. -template <__fmt_char_type _CharT> -class _LIBCPP_TEMPLATE_VIS __direct_storage {}; - template concept __enable_direct_output = __fmt_char_type<_CharT> && @@ -259,40 +324,6 @@ concept __enable_direct_output = // `#ifdef`. || same_as<_OutIt, __wrap_iter<_CharT*>>); -/// Write policy for directly writing to the underlying output. -template -class _LIBCPP_TEMPLATE_VIS __writer_direct { -public: - _LIBCPP_HIDE_FROM_ABI explicit __writer_direct(_OutIt __out_it) : __out_it_(__out_it) {} - - _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() { return __out_it_; } - - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT*, size_t __n) { - // _OutIt can be a __wrap_iter. Therefore the original iterator - // is adjusted. - __out_it_ += __n; - } - -private: - _OutIt __out_it_; -}; - -/// Write policy for copying the buffer to the output. -template -class _LIBCPP_TEMPLATE_VIS __writer_iterator { -public: - _LIBCPP_HIDE_FROM_ABI explicit __writer_iterator(_OutIt __out_it) : __out_it_{std::move(__out_it)} {} - - _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { return std::move(__out_it_); } - - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - __out_it_ = std::ranges::copy_n(__ptr, __n, std::move(__out_it_)).out; - } - -private: - _OutIt __out_it_; -}; - /// Concept to see whether a \a _Container is insertable. 
/// /// The concept is used to validate whether multiple calls to a @@ -310,196 +341,220 @@ concept __insertable = /// Extract the container type of a \ref back_insert_iterator. template struct _LIBCPP_TEMPLATE_VIS __back_insert_iterator_container { - using type = void; + using type _LIBCPP_NODEBUG = void; }; template <__insertable _Container> struct _LIBCPP_TEMPLATE_VIS __back_insert_iterator_container> { - using type = _Container; + using type _LIBCPP_NODEBUG = _Container; }; -/// Write policy for inserting the buffer in a container. -template -class _LIBCPP_TEMPLATE_VIS __writer_container { +// A dynamically growing buffer. +template <__fmt_char_type _CharT> +class _LIBCPP_TEMPLATE_VIS __allocating_buffer : public __output_buffer<_CharT> { public: - using _CharT _LIBCPP_NODEBUG = typename _Container::value_type; + __allocating_buffer(const __allocating_buffer&) = delete; + __allocating_buffer& operator=(const __allocating_buffer&) = delete; - _LIBCPP_HIDE_FROM_ABI explicit __writer_container(back_insert_iterator<_Container> __out_it) - : __container_{__out_it.__get_container()} {} + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __allocating_buffer() : __allocating_buffer{nullptr} {} - _LIBCPP_HIDE_FROM_ABI auto __out_it() { return std::back_inserter(*__container_); } + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __allocating_buffer(__max_output_size* __max_output_size) + : __output_buffer<_CharT>{__small_buffer_, __buffer_size_, __prepare_write, __max_output_size} {} - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - __container_->insert(__container_->end(), __ptr, __ptr + __n); + _LIBCPP_HIDE_FROM_ABI ~__allocating_buffer() { + if (__ptr_ != __small_buffer_) + _Alloc{}.deallocate(__ptr_, this->__capacity()); } -private: - _Container* __container_; -}; + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI basic_string_view<_CharT> __view() { return {__ptr_, this->__size()}; } -/// Selects the type of the writer used for the output iterator. -template -class _LIBCPP_TEMPLATE_VIS __writer_selector { - using _Container _LIBCPP_NODEBUG = typename __back_insert_iterator_container<_OutIt>::type; +private: + using _Alloc _LIBCPP_NODEBUG = allocator<_CharT>; -public: - using type = - conditional_t, - __writer_container<_Container>, - conditional_t<__enable_direct_output<_OutIt, _CharT>, - __writer_direct<_OutIt, _CharT>, - __writer_iterator<_OutIt, _CharT>>>; -}; + // Since allocating is expensive the class has a small internal buffer. When + // its capacity is exceeded a dynamic buffer will be allocated. + static constexpr size_t __buffer_size_ = 256; + _CharT __small_buffer_[__buffer_size_]; -/// The generic formatting buffer. 
-template - requires(output_iterator<_OutIt, const _CharT&>) -class _LIBCPP_TEMPLATE_VIS __format_buffer { - using _Storage _LIBCPP_NODEBUG = - conditional_t<__enable_direct_output<_OutIt, _CharT>, __direct_storage<_CharT>, __internal_storage<_CharT>>; + _CharT* __ptr_{__small_buffer_}; -public: - _LIBCPP_HIDE_FROM_ABI explicit __format_buffer(_OutIt __out_it) - requires(same_as<_Storage, __internal_storage<_CharT>>) - : __output_(__storage_.__begin(), __storage_.__buffer_size, this), __writer_(std::move(__out_it)) {} + _LIBCPP_HIDE_FROM_ABI void __grow_buffer(size_t __capacity) { + if (__capacity < __buffer_size_) + return; - _LIBCPP_HIDE_FROM_ABI explicit __format_buffer(_OutIt __out_it) - requires(same_as<_Storage, __direct_storage<_CharT>>) - : __output_(std::__unwrap_iter(__out_it), size_t(-1), this), __writer_(std::move(__out_it)) {} + _LIBCPP_ASSERT_INTERNAL(__capacity > this->__capacity(), "the buffer must grow"); - _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return __output_.__make_output_iterator(); } + // _CharT is an implicit lifetime type so can be used without explicit + // construction or destruction. + _Alloc __alloc; + auto __result = std::__allocate_at_least(__alloc, __capacity); + std::copy_n(__ptr_, this->__size(), __result.ptr); + if (__ptr_ != __small_buffer_) + __alloc.deallocate(__ptr_, this->__capacity()); - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { __writer_.__flush(__ptr, __n); } + __ptr_ = __result.ptr; + this->__buffer_moved(__ptr_, __result.count); + } - _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { - __output_.__flush(); - return std::move(__writer_).__out_it(); + _LIBCPP_HIDE_FROM_ABI void __prepare_write(size_t __size_hint) { + __grow_buffer(std::max(this->__capacity() + __size_hint, this->__capacity() * 1.6)); } -private: - _LIBCPP_NO_UNIQUE_ADDRESS _Storage __storage_; - __output_buffer<_CharT> __output_; - typename __writer_selector<_OutIt, _CharT>::type __writer_; + _LIBCPP_HIDE_FROM_ABI static void __prepare_write(__output_buffer<_CharT>& __buffer, size_t __size_hint) { + static_cast<__allocating_buffer<_CharT>&>(__buffer).__prepare_write(__size_hint); + } }; -/// A buffer that counts the number of insertions. -/// -/// Since \ref formatted_size only needs to know the size, the output itself is -/// discarded. -template <__fmt_char_type _CharT> -class _LIBCPP_TEMPLATE_VIS __formatted_size_buffer { +// A buffer that directly writes to the underlying buffer. 
+template +class _LIBCPP_TEMPLATE_VIS __direct_iterator_buffer : public __output_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return __output_.__make_output_iterator(); } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __direct_iterator_buffer(_OutIt __out_it) + : __direct_iterator_buffer{__out_it, nullptr} {} - _LIBCPP_HIDE_FROM_ABI void __flush(const _CharT*, size_t __n) { __size_ += __n; } + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __direct_iterator_buffer(_OutIt __out_it, __max_output_size* __max_output_size) + : __output_buffer<_CharT>{std::__unwrap_iter(__out_it), __buffer_size, __prepare_write, __max_output_size}, + __out_it_(__out_it) {} - _LIBCPP_HIDE_FROM_ABI size_t __result() && { - __output_.__flush(); - return __size_; - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI _OutIt __out_it() && { return __out_it_ + this->__size(); } private: - __internal_storage<_CharT> __storage_; - __output_buffer<_CharT> __output_{__storage_.__begin(), __storage_.__buffer_size, this}; - size_t __size_{0}; -}; + // The function format_to expects a buffer large enough for the output. The + // function format_to_n has its own helper class that restricts the number of + // write options. So this function class can pretend to have an infinite + // buffer. + static constexpr size_t __buffer_size = -1; + + _OutIt __out_it_; -/// The base of a buffer that counts and limits the number of insertions. -template - requires(output_iterator<_OutIt, const _CharT&>) -struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base { - using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>; + _LIBCPP_HIDE_FROM_ABI static void + __prepare_write([[maybe_unused]] __output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) { + std::__throw_length_error("__direct_iterator_buffer"); + } +}; +// A buffer that writes its output to the end of a container. +template +class _LIBCPP_TEMPLATE_VIS __container_inserter_buffer : public __output_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size) - : __writer_(std::move(__out_it)), __max_size_(std::max(_Size(0), __max_size)) {} + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __container_inserter_buffer(_OutIt __out_it) + : __container_inserter_buffer{__out_it, nullptr} {} - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - if (_Size(__size_) <= __max_size_) - __writer_.__flush(__ptr, std::min(_Size(__n), __max_size_ - __size_)); - __size_ += __n; + [[nodiscard]] + _LIBCPP_HIDE_FROM_ABI explicit __container_inserter_buffer(_OutIt __out_it, __max_output_size* __max_output_size) + : __output_buffer<_CharT>{__small_buffer_, __buffer_size, __prepare_write, __max_output_size}, + __container_{__out_it.__get_container()} {} + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __out_it() && { + __container_->insert(__container_->end(), __small_buffer_, __small_buffer_ + this->__size()); + return std::back_inserter(*__container_); } -protected: - __internal_storage<_CharT> __storage_; - __output_buffer<_CharT> __output_{__storage_.__begin(), __storage_.__buffer_size, this}; - typename __writer_selector<_OutIt, _CharT>::type __writer_; +private: + typename __back_insert_iterator_container<_OutIt>::type* __container_; + + // This class uses a fixed size buffer and appends the elements in + // __buffer_size chunks. An alternative would be to use an allocating buffer + // and append the output in a single write operation. Benchmarking showed no + // performance difference. 
+ static constexpr size_t __buffer_size = 256; + _CharT __small_buffer_[__buffer_size]; + + _LIBCPP_HIDE_FROM_ABI void __prepare_write() { + __container_->insert(__container_->end(), __small_buffer_, __small_buffer_ + this->__size()); + this->__buffer_flushed(); + } - _Size __max_size_; - _Size __size_{0}; + _LIBCPP_HIDE_FROM_ABI static void + __prepare_write(__output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) { + static_cast<__container_inserter_buffer<_OutIt, _CharT>&>(__buffer).__prepare_write(); + } }; -/// The base of a buffer that counts and limits the number of insertions. -/// -/// This version is used when \c __enable_direct_output<_OutIt, _CharT> == true. -/// -/// This class limits the size available to the direct writer so it will not -/// exceed the maximum number of code units. +// A buffer that writes to an iterator. +// +// Unlike the __container_inserter_buffer this class' performance does benefit +// from allocating and then inserting. template - requires(output_iterator<_OutIt, const _CharT&>) -class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer_base<_OutIt, _CharT, true> { - using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>; - +class _LIBCPP_TEMPLATE_VIS __iterator_buffer : public __allocating_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer_base(_OutIt __out_it, _Size __max_size) - : __output_(std::__unwrap_iter(__out_it), __max_size, this), - __writer_(std::move(__out_it)), - __max_size_(__max_size) { - if (__max_size <= 0) [[unlikely]] - __output_.__reset(__storage_.__begin(), __storage_.__buffer_size); - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __iterator_buffer(_OutIt __out_it) + : __allocating_buffer<_CharT>{}, __out_it_{std::move(__out_it)} {} - _LIBCPP_HIDE_FROM_ABI void __flush(_CharT* __ptr, size_t __n) { - // A __flush to the direct writer happens in the following occasions: - // - The format function has written the maximum number of allowed code - // units. At this point it's no longer valid to write to this writer. So - // switch to the internal storage. This internal storage doesn't need to - // be written anywhere so the __flush for that storage writes no output. - // - Like above, but the next "mass write" operation would overflow the - // buffer. In that case the buffer is pre-emptively switched. The still - // valid code units will be written separately. - // - The format_to_n function is finished. In this case there's no need to - // switch the buffer, but for simplicity the buffers are still switched. - // When the __max_size <= 0 the constructor already switched the buffers. - if (__size_ == 0 && __ptr != __storage_.__begin()) { - __writer_.__flush(__ptr, __n); - __output_.__reset(__storage_.__begin(), __storage_.__buffer_size); - } else if (__size_ < __max_size_) { - // Copies a part of the internal buffer to the output up to n characters. - // See __output_buffer<_CharT>::__flush_on_overflow for more information. 
- _Size __s = std::min(_Size(__n), __max_size_ - __size_); - std::copy_n(__ptr, __s, __writer_.__out_it()); - __writer_.__flush(__ptr, __s); - } + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI explicit __iterator_buffer(_OutIt __out_it, __max_output_size* __max_output_size) + : __allocating_buffer<_CharT>{__max_output_size}, __out_it_{std::move(__out_it)} {} - __size_ += __n; + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __out_it() && { + return std::ranges::copy(this->__view(), std::move(__out_it_)).out; } -protected: - __internal_storage<_CharT> __storage_; - __output_buffer<_CharT> __output_; - __writer_direct<_OutIt, _CharT> __writer_; +private: + _OutIt __out_it_; +}; + +// Selects the type of the buffer used for the output iterator. +template +class _LIBCPP_TEMPLATE_VIS __buffer_selector { + using _Container _LIBCPP_NODEBUG = __back_insert_iterator_container<_OutIt>::type; - _Size __max_size_; - _Size __size_{0}; +public: + using type _LIBCPP_NODEBUG = + conditional_t, + __container_inserter_buffer<_OutIt, _CharT>, + conditional_t<__enable_direct_output<_OutIt, _CharT>, + __direct_iterator_buffer<_OutIt, _CharT>, + __iterator_buffer<_OutIt, _CharT>>>; }; -/// The buffer that counts and limits the number of insertions. +// A buffer that counts and limits the number of insertions. template - requires(output_iterator<_OutIt, const _CharT&>) -struct _LIBCPP_TEMPLATE_VIS __format_to_n_buffer final - : public __format_to_n_buffer_base< _OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>> { - using _Base _LIBCPP_NODEBUG = __format_to_n_buffer_base<_OutIt, _CharT, __enable_direct_output<_OutIt, _CharT>>; - using _Size _LIBCPP_NODEBUG = iter_difference_t<_OutIt>; +class _LIBCPP_TEMPLATE_VIS __format_to_n_buffer : private __buffer_selector<_OutIt, _CharT>::type { +public: + using _Base _LIBCPP_NODEBUG = __buffer_selector<_OutIt, _CharT>::type; + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __format_to_n_buffer(_OutIt __out_it, iter_difference_t<_OutIt> __n) + : _Base{std::move(__out_it), std::addressof(__max_output_size_)}, + __max_output_size_{__n < 0 ? size_t{0} : static_cast(__n)} {} + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return _Base::__make_output_iterator(); } + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __result() && { + return {static_cast<_Base&&>(*this).__out_it(), + static_cast>(__max_output_size_.__code_units_written())}; + } + +private: + __max_output_size __max_output_size_; +}; +// A buffer that counts the number of insertions. +// +// Since formatted_size only needs to know the size, the output itself is +// discarded. +template <__fmt_char_type _CharT> +class _LIBCPP_TEMPLATE_VIS __formatted_size_buffer : private __output_buffer<_CharT> { public: - _LIBCPP_HIDE_FROM_ABI explicit __format_to_n_buffer(_OutIt __out_it, _Size __max_size) - : _Base(std::move(__out_it), __max_size) {} - _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return this->__output_.__make_output_iterator(); } + using _Base _LIBCPP_NODEBUG = __output_buffer<_CharT>; + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI __formatted_size_buffer() + : _Base{nullptr, 0, __prepare_write, std::addressof(__max_output_size_)} {} + + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI auto __make_output_iterator() { return _Base::__make_output_iterator(); } + + // This function does not need to be r-value qualified, however this is + // consistent with similar objects. 
+ [[nodiscard]] _LIBCPP_HIDE_FROM_ABI size_t __result() && { return __max_output_size_.__code_units_written(); } + +private: + __max_output_size __max_output_size_{0}; - _LIBCPP_HIDE_FROM_ABI format_to_n_result<_OutIt> __result() && { - this->__output_.__flush(); - return {std::move(this->__writer_).__out_it(), this->__size_}; + _LIBCPP_HIDE_FROM_ABI static void + __prepare_write([[maybe_unused]] __output_buffer<_CharT>& __buffer, [[maybe_unused]] size_t __size_hint) { + // Note this function does not satisfy the requirement of giving a 1 code unit buffer. + _LIBCPP_ASSERT_INTERNAL( + false, "Since __max_output_size_.__max_size_ == 0 there should never be call to this function."); } }; @@ -526,11 +581,11 @@ class _LIBCPP_TEMPLATE_VIS __retarget_buffer { using _Alloc _LIBCPP_NODEBUG = allocator<_CharT>; public: - using value_type = _CharT; + using value_type _LIBCPP_NODEBUG = _CharT; struct __iterator { - using difference_type = ptrdiff_t; - using value_type = _CharT; + using difference_type _LIBCPP_NODEBUG = ptrdiff_t; + using value_type _LIBCPP_NODEBUG = _CharT; _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(__retarget_buffer& __buffer) : __buffer_(std::addressof(__buffer)) {} diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index b920be5acbe86..5feaf7e5a064a 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -31,7 +31,6 @@ #include <__format/formatter_pointer.h> #include <__format/formatter_string.h> #include <__format/parser_std_format_spec.h> -#include <__iterator/back_insert_iterator.h> #include <__iterator/concepts.h> #include <__iterator/incrementable_traits.h> #include <__iterator/iterator_traits.h> // iter_value_t @@ -411,7 +410,7 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __vformat_to(_OutIt __out_it, return std::__format::__vformat_to( basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(std::move(__out_it), __args)); else { - __format::__format_buffer<_OutIt, _CharT> __buffer{std::move(__out_it)}; + typename __format::__buffer_selector<_OutIt, _CharT>::type __buffer{std::move(__out_it)}; std::__format::__vformat_to(basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(__buffer.__make_output_iterator(), __args)); return std::move(__buffer).__out_it(); @@ -452,9 +451,9 @@ format_to(_OutIt __out_it, wformat_string<_Args...> __fmt, _Args&&... __args) { // fires too eagerly, see http://llvm.org/PR61563. 
template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(string_view __fmt, format_args __args) { - string __res; - std::vformat_to(std::back_inserter(__res), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args); + return string{__buffer.__view()}; } # if _LIBCPP_HAS_WIDE_CHARACTERS @@ -463,9 +462,9 @@ template template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring vformat(wstring_view __fmt, wformat_args __args) { - wstring __res; - std::vformat_to(std::back_inserter(__res), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), __fmt, __args); + return wstring{__buffer.__view()}; } # endif @@ -544,7 +543,7 @@ _LIBCPP_HIDE_FROM_ABI _OutIt __vformat_to( return std::__format::__vformat_to(basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(std::move(__out_it), __args, std::move(__loc))); else { - __format::__format_buffer<_OutIt, _CharT> __buffer{std::move(__out_it)}; + typename __format::__buffer_selector<_OutIt, _CharT>::type __buffer{std::move(__out_it)}; std::__format::__vformat_to( basic_format_parse_context{__fmt, __args.__size()}, std::__format_context_create(__buffer.__make_output_iterator(), __args, std::move(__loc))); @@ -585,9 +584,9 @@ format_to(_OutIt __out_it, locale __loc, wformat_string<_Args...> __fmt, _Args&& template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI string vformat(locale __loc, string_view __fmt, format_args __args) { - string __res; - std::vformat_to(std::back_inserter(__res), std::move(__loc), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), std::move(__loc), __fmt, __args); + return string{__buffer.__view()}; } # if _LIBCPP_HAS_WIDE_CHARACTERS @@ -596,9 +595,9 @@ vformat(locale __loc, string_view __fmt, format_args __args) { template [[nodiscard]] _LIBCPP_ALWAYS_INLINE inline _LIBCPP_HIDE_FROM_ABI wstring vformat(locale __loc, wstring_view __fmt, wformat_args __args) { - wstring __res; - std::vformat_to(std::back_inserter(__res), std::move(__loc), __fmt, __args); - return __res; + __format::__allocating_buffer __buffer; + std::vformat_to(__buffer.__make_output_iterator(), std::move(__loc), __fmt, __args); + return wstring{__buffer.__view()}; } # endif diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap index 85b88ca137f85..6c2fb8dc3940b 100644 --- a/libcxx/include/module.modulemap +++ b/libcxx/include/module.modulemap @@ -1253,7 +1253,10 @@ module std [system] { } module format { - module buffer { header "__format/buffer.h" } + module buffer { + header "__format/buffer.h" + export std.iterator.back_insert_iterator + } module concepts { header "__format/concepts.h" } module container_adaptor { header "__format/container_adaptor.h" } module enable_insertable { header "__format/enable_insertable.h" } diff --git a/libcxx/test/std/utilities/format/format.functions/format_tests.h b/libcxx/test/std/utilities/format/format.functions/format_tests.h index b2ed6775fe8a1..3969b341cb146 100644 --- a/libcxx/test/std/utilities/format/format.functions/format_tests.h +++ b/libcxx/test/std/utilities/format/format.functions/format_tests.h @@ -3038,7 +3038,7 @@ void format_test_buffer_optimizations(TestFunction check) { // Used to validate our test sets are the proper size. 
// To test the chunked operations it needs to be larger than the internal // buffer. Picked a nice looking number. - constexpr int minimum = 3 * std::__format::__internal_storage::__buffer_size; + constexpr int minimum = 3 * 256; #else constexpr int minimum = 1; #endif From 8b6211472793680994f7bc15abb5910d0a916cc5 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Sat, 23 Nov 2024 16:29:19 -0800 Subject: [PATCH 096/432] [lldb] Delete unused lldbutil.print_registers (NFC) --- .../Python/lldbsuite/test/lldbutil.py | 27 ------------------- .../API/macosx/universal/TestUniversal.py | 2 -- 2 files changed, 29 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/lldbutil.py b/lldb/packages/Python/lldbsuite/test/lldbutil.py index 07b5f8cc7d900..ef068cf7f9ed1 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbutil.py @@ -1353,33 +1353,6 @@ def get_args_as_string(frame, showFuncName=True): return "(%s)" % (", ".join(args)) -def print_registers(frame, string_buffer=False): - """Prints all the register sets of the frame.""" - - output = io.StringIO() if string_buffer else sys.stdout - - print("Register sets for " + str(frame), file=output) - - registerSet = frame.GetRegisters() # Return type of SBValueList. - print( - "Frame registers (size of register set = %d):" % registerSet.GetSize(), - file=output, - ) - for value in registerSet: - # print(value, file=output) - print( - "%s (number of children = %d):" % (value.GetName(), value.GetNumChildren()), - file=output, - ) - for child in value: - print( - "Name: %s, Value: %s" % (child.GetName(), child.GetValue()), file=output - ) - - if string_buffer: - return output.getvalue() - - def get_registers(frame, kind): """Returns the registers given the frame and the kind of registers desired. diff --git a/lldb/test/API/macosx/universal/TestUniversal.py b/lldb/test/API/macosx/universal/TestUniversal.py index aecc8814b377e..3c043df641978 100644 --- a/lldb/test/API/macosx/universal/TestUniversal.py +++ b/lldb/test/API/macosx/universal/TestUniversal.py @@ -57,8 +57,6 @@ def test_sbdebugger_create_target_with_file_and_target_triple(self): @skipIf(compiler="clang", compiler_version=["<", "7.0"]) def test_process_launch_for_universal(self): """Test process launch of a universal binary.""" - from lldbsuite.test.lldbutil import print_registers - if not haswellOrLater(): return From b178c2d63e0701655046dfd2ead195b36e0df397 Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Sat, 25 Jan 2025 08:28:51 -0800 Subject: [PATCH 097/432] [SandboxVec][DAG] Fix trim schedule Fix trimSchedule by skipping instructions without a DAG Node. 
--- llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp index 06c1ef6b6d5ae..9ec5d830d8b4a 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp @@ -172,6 +172,8 @@ void Scheduler::trimSchedule(ArrayRef Instrs) { for (auto *I = LowestI, *E = TopI->getPrevNode(); I != E; I = I->getPrevNode()) { auto *N = DAG.getNode(I); + if (N == nullptr) + continue; if (auto *SB = N->getSchedBundle()) eraseBundle(SB); } From aba0476f23fc2a851792e9d85c25ee34a5ea7ed0 Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Sat, 23 Nov 2024 16:30:57 -0800 Subject: [PATCH 098/432] [lldb] Delete lldbutil.PrintableRegex (NFC) Use of this class wasn't making use of the original regex string. Note that `re.Pattern` has a `pattern` property to access the original regex. --- lldb/packages/Python/lldbsuite/test/lldbutil.py | 15 --------------- .../libcxx/atomic/TestLibCxxAtomic.py | 5 ++--- .../libcxx/initializerlist/TestInitializerList.py | 5 ++--- 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/lldbutil.py b/lldb/packages/Python/lldbsuite/test/lldbutil.py index ef068cf7f9ed1..27e0040034370 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbutil.py +++ b/lldb/packages/Python/lldbsuite/test/lldbutil.py @@ -1578,21 +1578,6 @@ def set_actions_for_signal( ) -class PrintableRegex(object): - def __init__(self, text): - self.regex = re.compile(text) - self.text = text - - def match(self, str): - return self.regex.match(str) - - def __str__(self): - return "%s" % (self.text) - - def __repr__(self): - return "re.compile(%s) -> %s" % (self.text, self.regex) - - def skip_if_callable(test, mycallable, reason): if callable(mycallable): if mycallable(test): diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py index 241226d50df80..c6592ede03147 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/atomic/TestLibCxxAtomic.py @@ -2,6 +2,7 @@ Test lldb data formatter subsystem. """ +import re import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -30,9 +31,7 @@ def test(self): self.runCmd("run", RUN_SUCCEEDED) - lldbutil.skip_if_library_missing( - self, self.target(), lldbutil.PrintableRegex("libc\+\+") - ) + lldbutil.skip_if_library_missing(self, self.target(), re.compile(r"libc\+\+")) # The stop reason of the thread should be breakpoint. 
self.expect( diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py index 93d5392830b50..b8a1dd3569d77 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/initializerlist/TestInitializerList.py @@ -3,6 +3,7 @@ """ +import re import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -24,9 +25,7 @@ def test(self): self.runCmd("run", RUN_SUCCEEDED) - lldbutil.skip_if_library_missing( - self, self.target(), lldbutil.PrintableRegex("libc\+\+") - ) + lldbutil.skip_if_library_missing(self, self.target(), re.compile(r"libc\+\+")) # The stop reason of the thread should be breakpoint. self.expect( From def50f701f6a2c1e0550bb341fd8b64bed299e72 Mon Sep 17 00:00:00 2001 From: Hui Date: Sat, 25 Jan 2025 18:30:00 +0000 Subject: [PATCH 099/432] [libc++] implement `std::flat_multimap` (#113835) fixes https://github.com/llvm/llvm-project/issues/105190 --------- Co-authored-by: Hui Xie Co-authored-by: Hui Xie --- libcxx/docs/FeatureTestMacroTable.rst | 4 + libcxx/docs/Status/Cxx23Papers.csv | 2 +- libcxx/include/CMakeLists.txt | 3 + libcxx/include/__flat_map/flat_map.h | 123 +- libcxx/include/__flat_map/flat_multimap.h | 1010 +++++++++++++++++ libcxx/include/__flat_map/sorted_equivalent.h | 31 + libcxx/include/__flat_map/utils.h | 103 ++ libcxx/include/__functional/is_transparent.h | 6 +- libcxx/include/flat_map | 21 + libcxx/include/module.modulemap | 13 +- libcxx/include/version | 4 + libcxx/modules/std/flat_map.inc | 4 +- .../flat.map/assert.input_range.pass.cpp | 0 .../flat.map/assert.sorted_unique.pass.cpp | 0 .../flat.multimap/assert.input_range.pass.cpp | 66 ++ .../assert.sorted_equivalent.pass.cpp | 225 ++++ .../flat.map.syn/sorted_equivalent.pass.cpp | 50 + .../flat.map/flat.map.capacity/empty.pass.cpp | 4 +- .../flat.map.capacity/empty.verify.cpp | 6 +- .../flat.map.cons/deduct.compile.pass.cpp | 52 + .../flat.map/flat.map.cons/deduct.pass.cpp | 63 +- .../flat.map/flat.map.cons/deduct.verify.cpp | 44 - .../flat.map.cons/default_noexcept.pass.cpp | 2 + .../flat.map.cons/dtor_noexcept.pass.cpp | 6 +- .../flat.multimap.capacity/empty.pass.cpp | 51 + .../flat.multimap.capacity/empty.verify.cpp | 22 + .../flat.multimap.capacity/max_size.pass.cpp | 78 ++ .../flat.multimap.capacity/size.pass.cpp | 70 ++ .../flat.multimap.cons/alloc.pass.cpp | 72 ++ .../assign_initializer_list.pass.cpp | 58 + .../flat.multimap.cons/compare.pass.cpp | 93 ++ .../flat.multimap.cons/containers.pass.cpp | 187 +++ .../flat.multimap.cons/copy.pass.cpp | 70 ++ .../flat.multimap.cons/copy_alloc.pass.cpp | 67 ++ .../copy_assign.addressof.compile.pass.cpp | 30 + .../flat.multimap.cons/copy_assign.pass.cpp | 81 ++ .../deduct.compile.pass.cpp | 52 + .../flat.multimap.cons/deduct.pass.cpp | 343 ++++++ .../flat.multimap.cons/deduct.verify.cpp | 57 + .../flat.multimap.cons/deduct_pmr.pass.cpp | 107 ++ .../flat.multimap.cons/default.pass.cpp | 72 ++ .../default_noexcept.pass.cpp | 61 + .../flat.multimap.cons/dtor_noexcept.pass.cpp | 57 + .../initializer_list.pass.cpp | 159 +++ .../flat.multimap.cons/iter_iter.pass.cpp | 154 +++ .../flat.multimap.cons/move.pass.cpp | 89 ++ .../flat.multimap.cons/move_alloc.pass.cpp | 82 ++ .../flat.multimap.cons/move_assign.pass.cpp | 74 ++ 
.../move_assign_clears.pass.cpp | 101 ++ .../move_assign_noexcept.pass.cpp | 110 ++ .../move_exceptions.pass.cpp | 71 ++ .../flat.multimap.cons/move_noexcept.pass.cpp | 104 ++ .../flat.multimap.cons/pmr.pass.cpp | 361 ++++++ .../flat.multimap.cons/range.pass.cpp | 227 ++++ .../sorted_container.pass.cpp | 165 +++ .../sorted_initializer_list.pass.cpp | 183 +++ .../sorted_iter_iter.pass.cpp | 173 +++ .../flat.multimap.erasure/erase_if.pass.cpp | 98 ++ .../erase_if_exceptions.pass.cpp | 157 +++ .../flat.multimap.iterators/iterator.pass.cpp | 105 ++ .../iterator_comparison.pass.cpp | 155 +++ ...rator_concept_conformance.compile.pass.cpp | 84 ++ ...range_concept_conformance.compile.pass.cpp | 55 + .../reverse_iterator.pass.cpp | 92 ++ .../flat.multimap.modifiers/clear.pass.cpp | 64 ++ .../flat.multimap.modifiers/emplace.pass.cpp | 158 +++ .../emplace_hint.pass.cpp | 228 ++++ .../erase_iter.pass.cpp | 127 +++ .../erase_iter_iter.pass.cpp | 99 ++ .../erase_key.pass.cpp | 99 ++ .../erase_key_transparent.pass.cpp | 161 +++ .../flat.multimap.modifiers/extract.pass.cpp | 93 ++ .../insert_cv.pass.cpp | 81 ++ .../insert_initializer_list.pass.cpp | 83 ++ .../insert_iter_cv.pass.cpp | 95 ++ .../insert_iter_iter.pass.cpp | 109 ++ .../insert_iter_rv.pass.cpp | 103 ++ .../insert_range.pass.cpp | 101 ++ .../insert_rv.pass.cpp | 116 ++ .../insert_sorted_initializer_list.pass.cpp | 66 ++ .../insert_sorted_iter_iter.pass.cpp | 94 ++ .../insert_transparent.pass.cpp | 135 +++ .../flat.multimap.modifiers/replace.pass.cpp | 82 ++ .../swap_exception.pass.cpp | 80 ++ .../swap_free.pass.cpp | 99 ++ .../swap_member.pass.cpp | 97 ++ .../flat.multimap.observers/comp.pass.cpp | 98 ++ .../keys_values.pass.cpp | 59 + .../contains.pass.cpp | 72 ++ .../contains_transparent.pass.cpp | 73 ++ .../flat.multimap.operations/count.pass.cpp | 71 ++ .../count_transparent.pass.cpp | 83 ++ .../equal_range.pass.cpp | 81 ++ .../equal_range_transparent.pass.cpp | 110 ++ .../flat.multimap.operations/find.pass.cpp | 57 + .../find_transparent.pass.cpp | 99 ++ .../lower_bound.pass.cpp | 73 ++ .../lower_bound_transparent.pass.cpp | 107 ++ .../upper_bound.pass.cpp | 76 ++ .../upper_bound_transparent.pass.cpp | 106 ++ .../flat.multimap/helpers.h | 389 +++++++ .../flat.multimap/incomplete_type.pass.cpp | 33 + .../flat.multimap/op_compare.pass.cpp | 133 +++ .../flat.multimap/types.compile.pass.cpp | 133 +++ .../flat_map.version.compile.pass.cpp | 68 ++ .../version.version.compile.pass.cpp | 74 ++ .../generate_feature_test_macro_components.py | 11 + 107 files changed, 10468 insertions(+), 177 deletions(-) create mode 100644 libcxx/include/__flat_map/flat_multimap.h create mode 100644 libcxx/include/__flat_map/sorted_equivalent.h create mode 100644 libcxx/include/__flat_map/utils.h rename libcxx/test/libcxx/containers/{containers.adaptors => container.adaptors}/flat.map/assert.input_range.pass.cpp (100%) rename libcxx/test/libcxx/containers/{containers.adaptors => container.adaptors}/flat.map/assert.sorted_unique.pass.cpp (100%) create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp create mode 100644 libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp create mode 100644 
libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp create mode 100644 libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp create mode 100644 libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index cfb0e5cfb129c..ccaa784ccb088 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -330,6 +330,10 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_expected`` ``202211L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_flat_map`` ``202207L`` + 
---------------------------------------------------------- ----------------- + ``__cpp_lib_flat_set`` *unimplemented* + ---------------------------------------------------------- ----------------- ``__cpp_lib_format_ranges`` ``202207L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_formatters`` *unimplemented* diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index 24398574064e6..bc9d4f8866a73 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -52,7 +52,7 @@ "`P2443R1 `__","``views::chunk_by``","2022-02 (Virtual)","|Complete|","18","" "","","","","","" "`P0009R18 `__","mdspan: A Non-Owning Multidimensional Array Reference","2022-07 (Virtual)","|Complete|","18","" -"`P0429R9 `__","A Standard ``flat_map``","2022-07 (Virtual)","|In Progress|","","" +"`P0429R9 `__","A Standard ``flat_map``","2022-07 (Virtual)","|Complete|","","" "`P1169R4 `__","``static operator()``","2022-07 (Virtual)","|Complete|","16","" "`P1222R4 `__","A Standard ``flat_set``","2022-07 (Virtual)","","","" "`P1223R5 `__","``ranges::find_last()``, ``ranges::find_last_if()``, and ``ranges::find_last_if_not()``","2022-07 (Virtual)","|Complete|","19","" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 78d3192542b5a..8dac823503d73 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -362,8 +362,11 @@ set(files __filesystem/space_info.h __filesystem/u8path.h __flat_map/flat_map.h + __flat_map/flat_multimap.h __flat_map/key_value_iterator.h + __flat_map/sorted_equivalent.h __flat_map/sorted_unique.h + __flat_map/utils.h __format/buffer.h __format/concepts.h __format/container_adaptor.h diff --git a/libcxx/include/__flat_map/flat_map.h b/libcxx/include/__flat_map/flat_map.h index ab53b7a285ca4..a0594ed9dc411 100644 --- a/libcxx/include/__flat_map/flat_map.h +++ b/libcxx/include/__flat_map/flat_map.h @@ -29,9 +29,11 @@ #include <__cstddef/ptrdiff_t.h> #include <__flat_map/key_value_iterator.h> #include <__flat_map/sorted_unique.h> +#include <__flat_map/utils.h> #include <__functional/invoke.h> #include <__functional/is_transparent.h> #include <__functional/operations.h> +#include <__fwd/vector.h> #include <__iterator/concepts.h> #include <__iterator/distance.h> #include <__iterator/iterator_traits.h> @@ -131,7 +133,7 @@ class flat_map { _LIBCPP_HIDE_FROM_ABI static constexpr bool __allocator_ctor_constraint = _And, uses_allocator>::value; - _LIBCPP_HIDE_FROM_ABI static constexpr bool __is_compare_transparent = __is_transparent_v<_Compare, _Compare>; + _LIBCPP_HIDE_FROM_ABI static constexpr bool __is_compare_transparent = __is_transparent_v<_Compare>; public: // [flat.map.cons], construct/copy/destroy @@ -153,7 +155,7 @@ class flat_map { # if _LIBCPP_HAS_EXCEPTIONS } catch (...) 
{ __other.clear(); - // gcc does not like the `throw` keyword in a conditional noexcept function + // gcc does not like the `throw` keyword in a conditionally noexcept function if constexpr (!(is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_MappedContainer> && is_nothrow_move_constructible_v<_Compare>)) { throw; @@ -518,16 +520,16 @@ class flat_map { return emplace_hint(__hint, std::move(__x)); } - template - requires is_constructible_v, _Pp> - _LIBCPP_HIDE_FROM_ABI pair insert(_Pp&& __x) { - return emplace(std::forward<_Pp>(__x)); + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI pair insert(_PairLike&& __x) { + return emplace(std::forward<_PairLike>(__x)); } - template - requires is_constructible_v, _Pp> - _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _Pp&& __x) { - return emplace_hint(__hint, std::forward<_Pp>(__x)); + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _PairLike&& __x) { + return emplace_hint(__hint, std::forward<_PairLike>(__x)); } template @@ -860,22 +862,10 @@ class flat_map { __containers_.values.erase(__containers_.values.begin() + __dist, __containers_.values.end()); } - template - _LIBCPP_HIDE_FROM_ABI size_type __append(_InputIterator __first, _Sentinel __last) { - size_type __num_of_appended = 0; - for (; __first != __last; ++__first) { - value_type __kv = *__first; - __containers_.keys.insert(__containers_.keys.end(), std::move(__kv.first)); - __containers_.values.insert(__containers_.values.end(), std::move(__kv.second)); - ++__num_of_appended; - } - return __num_of_appended; - } - template _LIBCPP_HIDE_FROM_ABI void __append_sort_merge_unique(_InputIterator __first, _Sentinel __last) { auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); - size_t __num_of_appended = __append(std::move(__first), std::move(__last)); + size_t __num_of_appended = __flat_map_utils::__append(*this, std::move(__first), std::move(__last)); if (__num_of_appended != 0) { auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); auto __append_start_offset = __containers_.keys.size() - __num_of_appended; @@ -963,7 +953,8 @@ class flat_map { if (__key_it == __containers_.keys.end() || __compare_(__key, *__key_it)) { return pair( - __try_emplace_exact_hint( + __flat_map_utils::__emplace_exact_pos( + *this, std::move(__key_it), std::move(__mapped_it), std::forward<_KeyArg>(__key), @@ -989,10 +980,13 @@ class flat_map { _LIBCPP_HIDE_FROM_ABI pair __try_emplace_hint(const_iterator __hint, _Kp&& __key, _Args&&... __args) { if (__is_hint_correct(__hint, __key)) { if (__hint == cend() || __compare_(__key, __hint->first)) { - return { - __try_emplace_exact_hint( - __hint.__key_iter_, __hint.__mapped_iter_, std::forward<_Kp>(__key), std::forward<_Args>(__args)...), - true}; + return {__flat_map_utils::__emplace_exact_pos( + *this, + __hint.__key_iter_, + __hint.__mapped_iter_, + std::forward<_Kp>(__key), + std::forward<_Args>(__args)...), + true}; } else { // key equals auto __dist = __hint - cbegin(); @@ -1003,49 +997,6 @@ class flat_map { } } - template - _LIBCPP_HIDE_FROM_ABI iterator - __try_emplace_exact_hint(_IterK&& __it_key, _IterM&& __it_mapped, _KeyArg&& __key, _MArgs&&... __mapped_args) { - auto __on_key_failed = std::__make_exception_guard([&]() noexcept { - if constexpr (__container_traits<_KeyContainer>::__emplacement_has_strong_exception_safety_guarantee) { - // Nothing to roll back! 
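As an aside on the hunk above: the rename of `_Pp` to `_PairLike` is purely cosmetic, since the `is_constructible_v<pair<key_type, mapped_type>, ...>` constraint is unchanged; the new name just documents that these `insert` overloads accept any pair-like argument, not only `std::pair`. A minimal caller-side sketch of what the constraint admits (my illustration, not part of the patch; the `tuple` line relies on C++23's pair-like constructors for `std::pair`):

    #include <flat_map>
    #include <string>
    #include <tuple>

    int main() {
      std::flat_map<int, std::string> m;
      m.insert({1, "one"});                        // value_type (std::pair) directly
      m.insert(std::tuple{2, std::string{"two"}}); // any pair-like type is accepted
      return 0;
    }
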
- } else { - // we need to clear both because we don't know the state of our keys anymore - clear() /* noexcept */; - } - }); - auto __key_it = __containers_.keys.emplace(__it_key, std::forward<_KeyArg>(__key)); - __on_key_failed.__complete(); - - auto __on_value_failed = std::__make_exception_guard([&]() noexcept { - if constexpr (!__container_traits<_MappedContainer>::__emplacement_has_strong_exception_safety_guarantee) { - // we need to clear both because we don't know the state of our values anymore - clear() /* noexcept */; - } else { - // In this case, we know the values are just like before we attempted emplacement, - // and we also know that the keys have been emplaced successfully. Just roll back the keys. -# if _LIBCPP_HAS_EXCEPTIONS - try { -# endif // _LIBCPP_HAS_EXCEPTIONS - __containers_.keys.erase(__key_it); -# if _LIBCPP_HAS_EXCEPTIONS - } catch (...) { - // Now things are funky for real. We're failing to rollback the keys. - // Just give up and clear the whole thing. - // - // Also, swallow the exception that happened during the rollback and let the - // original value-emplacement exception propagate normally. - clear() /* noexcept */; - } -# endif // _LIBCPP_HAS_EXCEPTIONS - } - }); - auto __mapped_it = __containers_.values.emplace(__it_mapped, std::forward<_MArgs>(__mapped_args)...); - __on_value_failed.__complete(); - - return iterator(std::move(__key_it), std::move(__mapped_it)); - } - template _LIBCPP_HIDE_FROM_ABI pair __insert_or_assign(_Kp&& __key, _Mapped&& __mapped) { auto __r = try_emplace(std::forward<_Kp>(__key), std::forward<_Mapped>(__mapped)); @@ -1087,8 +1038,10 @@ class flat_map { friend typename flat_map<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>::size_type erase_if(flat_map<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); + friend __flat_map_utils; + containers __containers_; - [[no_unique_address]] key_compare __compare_; + _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; struct __key_equiv { _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} @@ -1187,22 +1140,20 @@ template >, class _Allocator = allocator, class = __enable_if_t::value && __is_allocator<_Allocator>::value>> -flat_map(from_range_t, _Range&&, _Compare = _Compare(), _Allocator = _Allocator()) - -> flat_map< - __range_key_type<_Range>, - __range_mapped_type<_Range>, - _Compare, - vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, - vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; +flat_map(from_range_t, _Range&&, _Compare = _Compare(), _Allocator = _Allocator()) -> flat_map< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + _Compare, + vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; template ::value>> -flat_map(from_range_t, _Range&&, _Allocator) - -> flat_map< - __range_key_type<_Range>, - __range_mapped_type<_Range>, - less<__range_key_type<_Range>>, - vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, - vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; +flat_map(from_range_t, _Range&&, _Allocator) -> flat_map< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + less<__range_key_type<_Range>>, + vector<__range_key_type<_Range>, 
__allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; template > requires(!__is_allocator<_Compare>::value) diff --git a/libcxx/include/__flat_map/flat_multimap.h b/libcxx/include/__flat_map/flat_multimap.h new file mode 100644 index 0000000000000..ea77fb5d79bd2 --- /dev/null +++ b/libcxx/include/__flat_map/flat_multimap.h @@ -0,0 +1,1010 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___FLAT_MAP_FLAT_MULTIMAP_H +#define _LIBCPP___FLAT_MAP_FLAT_MULTIMAP_H + +#include <__algorithm/lexicographical_compare_three_way.h> +#include <__algorithm/min.h> +#include <__algorithm/ranges_equal.h> +#include <__algorithm/ranges_equal_range.h> +#include <__algorithm/ranges_inplace_merge.h> +#include <__algorithm/ranges_is_sorted.h> +#include <__algorithm/ranges_lower_bound.h> +#include <__algorithm/ranges_partition_point.h> +#include <__algorithm/ranges_sort.h> +#include <__algorithm/ranges_unique.h> +#include <__algorithm/ranges_upper_bound.h> +#include <__algorithm/remove_if.h> +#include <__assert> +#include <__compare/synth_three_way.h> +#include <__concepts/convertible_to.h> +#include <__concepts/swappable.h> +#include <__config> +#include <__cstddef/byte.h> +#include <__cstddef/ptrdiff_t.h> +#include <__flat_map/key_value_iterator.h> +#include <__flat_map/sorted_equivalent.h> +#include <__flat_map/utils.h> +#include <__functional/invoke.h> +#include <__functional/is_transparent.h> +#include <__functional/operations.h> +#include <__fwd/vector.h> +#include <__iterator/concepts.h> +#include <__iterator/distance.h> +#include <__iterator/iterator_traits.h> +#include <__iterator/ranges_iterator_traits.h> +#include <__iterator/reverse_iterator.h> +#include <__memory/allocator_traits.h> +#include <__memory/uses_allocator.h> +#include <__memory/uses_allocator_construction.h> +#include <__ranges/access.h> +#include <__ranges/concepts.h> +#include <__ranges/container_compatible_range.h> +#include <__ranges/drop_view.h> +#include <__ranges/from_range.h> +#include <__ranges/ref_view.h> +#include <__ranges/size.h> +#include <__ranges/subrange.h> +#include <__ranges/zip_view.h> +#include <__type_traits/conjunction.h> +#include <__type_traits/container_traits.h> +#include <__type_traits/invoke.h> +#include <__type_traits/is_allocator.h> +#include <__type_traits/is_nothrow_constructible.h> +#include <__type_traits/is_same.h> +#include <__type_traits/maybe_const.h> +#include <__utility/exception_guard.h> +#include <__utility/move.h> +#include <__utility/pair.h> +#include <__utility/scope_guard.h> +#include <__vector/vector.h> +#include +#include + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +#if _LIBCPP_STD_VER >= 23 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template , + class _KeyContainer = vector<_Key>, + class _MappedContainer = vector<_Tp>> +class flat_multimap { + template + friend class flat_multimap; + + static_assert(is_same_v<_Key, typename _KeyContainer::value_type>); + static_assert(is_same_v<_Tp, typename _MappedContainer::value_type>); + 
static_assert(!is_same_v<_KeyContainer, std::vector>, "vector is not a sequence container"); + static_assert(!is_same_v<_MappedContainer, std::vector>, "vector is not a sequence container"); + + template + using __iterator _LIBCPP_NODEBUG = __key_value_iterator; + +public: + // types + using key_type = _Key; + using mapped_type = _Tp; + using value_type = pair; + using key_compare = __type_identity_t<_Compare>; + using reference = pair; + using const_reference = pair; + using size_type = size_t; + using difference_type = ptrdiff_t; + using iterator = __iterator; // see [container.requirements] + using const_iterator = __iterator; // see [container.requirements] + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; + using key_container_type = _KeyContainer; + using mapped_container_type = _MappedContainer; + + class value_compare { + private: + key_compare __comp_; + _LIBCPP_HIDE_FROM_ABI value_compare(key_compare __c) : __comp_(__c) {} + friend flat_multimap; + + public: + _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + return __comp_(__x.first, __y.first); + } + }; + + struct containers { + key_container_type keys; + mapped_container_type values; + }; + +private: + template + _LIBCPP_HIDE_FROM_ABI static constexpr bool __allocator_ctor_constraint = + _And, uses_allocator>::value; + + _LIBCPP_HIDE_FROM_ABI static constexpr bool __is_compare_transparent = __is_transparent_v<_Compare>; + +public: + // [flat.map.cons], construct/copy/destroy + _LIBCPP_HIDE_FROM_ABI flat_multimap() noexcept( + is_nothrow_default_constructible_v<_KeyContainer> && is_nothrow_default_constructible_v<_MappedContainer> && + is_nothrow_default_constructible_v<_Compare>) + : __containers_(), __compare_() {} + + _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap&) = default; + + // The copy/move constructors are not specified in the spec, which means they should be defaulted. + // However, the move constructor can potentially leave a moved-from object in an inconsistent + // state if an exception is thrown. + _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other) noexcept( + is_nothrow_move_constructible_v<_KeyContainer> && is_nothrow_move_constructible_v<_MappedContainer> && + is_nothrow_move_constructible_v<_Compare>) +# if _LIBCPP_HAS_EXCEPTIONS + try +# endif // _LIBCPP_HAS_EXCEPTIONS + : __containers_(std::move(__other.__containers_)), __compare_(std::move(__other.__compare_)) { + __other.clear(); +# if _LIBCPP_HAS_EXCEPTIONS + } catch (...) 
{ + __other.clear(); + // gcc does not like the `throw` keyword in a conditionally noexcept function + if constexpr (!(is_nothrow_move_constructible_v<_KeyContainer> && + is_nothrow_move_constructible_v<_MappedContainer> && is_nothrow_move_constructible_v<_Compare>)) { + throw; + } +# endif // _LIBCPP_HAS_EXCEPTIONS + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(const flat_multimap& __other, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, + __alloc, + __other.__containers_.keys, + __other.__containers_.values, + __other.__compare_) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(flat_multimap&& __other, const _Allocator& __alloc) +# if _LIBCPP_HAS_EXCEPTIONS + try +# endif // _LIBCPP_HAS_EXCEPTIONS + : flat_multimap(__ctor_uses_allocator_tag{}, + __alloc, + std::move(__other.__containers_.keys), + std::move(__other.__containers_.values), + std::move(__other.__compare_)) { + __other.clear(); +# if _LIBCPP_HAS_EXCEPTIONS + } catch (...) { + __other.clear(); + throw; +# endif // _LIBCPP_HAS_EXCEPTIONS + } + + _LIBCPP_HIDE_FROM_ABI flat_multimap( + key_container_type __key_cont, mapped_container_type __mapped_cont, const key_compare& __comp = key_compare()) + : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + __sort(); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap( + const key_container_type& __key_cont, const mapped_container_type& __mapped_cont, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + __sort(); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + __sort(); + } + + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + key_container_type __key_cont, + mapped_container_type __mapped_cont, + const key_compare& __comp = key_compare()) + : __containers_{.keys = std::move(__key_cont), .values = std::move(__mapped_cont)}, __compare_(__comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == 
__containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); + } + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + const key_container_type& __key_cont, + const mapped_container_type& __mapped_cont, + const key_compare& __comp, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_tag{}, __alloc, __key_cont, __mapped_cont, __comp) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE(__containers_.keys.size() == __containers_.values.size(), + "flat_multimap keys and mapped containers have different size"); + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__containers_.keys), "Key container is not sorted"); + } + + _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const key_compare& __comp) : __containers_(), __compare_(__comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI explicit flat_multimap(const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) {} + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI + flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) + : __containers_(), __compare_(__comp) { + insert(__first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI + flat_multimap(_InputIterator __first, _InputIterator __last, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { + insert(__first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI flat_multimap(_InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { + insert(__first, __last); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t __fr, _Range&& __rg) + : flat_multimap(__fr, std::forward<_Range>(__rg), key_compare()) {} + + template <_ContainerCompatibleRange _Range, class _Allocator> + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { + insert_range(std::forward<_Range>(__rg)); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp) : flat_multimap(__comp) { + insert_range(std::forward<_Range>(__rg)); + } + + template <_ContainerCompatibleRange _Range, class _Allocator> + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(from_range_t, _Range&& __rg, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { + insert_range(std::forward<_Range>(__rg)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI 
flat_multimap( + sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const key_compare& __comp = key_compare()) + : __containers_(), __compare_(__comp) { + insert(sorted_equivalent, __first, __last); + } + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, + _InputIterator __first, + _InputIterator __last, + const key_compare& __comp, + const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc, __comp) { + insert(sorted_equivalent, __first, __last); + } + + template + requires(__has_input_iterator_category<_InputIterator>::value && __allocator_ctor_constraint<_Allocator>) + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, _InputIterator __first, _InputIterator __last, const _Allocator& __alloc) + : flat_multimap(__ctor_uses_allocator_empty_tag{}, __alloc) { + insert(sorted_equivalent, __first, __last); + } + + _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const key_compare& __comp = key_compare()) + : flat_multimap(__il.begin(), __il.end(), __comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(__il.begin(), __il.end(), __comp, __alloc) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(initializer_list __il, const _Allocator& __alloc) + : flat_multimap(__il.begin(), __il.end(), __alloc) {} + + _LIBCPP_HIDE_FROM_ABI + flat_multimap(sorted_equivalent_t, initializer_list __il, const key_compare& __comp = key_compare()) + : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap( + sorted_equivalent_t, initializer_list __il, const key_compare& __comp, const _Allocator& __alloc) + : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __comp, __alloc) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(sorted_equivalent_t, initializer_list __il, const _Allocator& __alloc) + : flat_multimap(sorted_equivalent, __il.begin(), __il.end(), __alloc) {} + + _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(initializer_list __il) { + clear(); + insert(__il); + return *this; + } + + // copy/move assignment are not specified in the spec (defaulted) + // but move assignment can potentially leave moved from object in an inconsistent + // state if an exception is thrown + _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(const flat_multimap&) = default; + + _LIBCPP_HIDE_FROM_ABI flat_multimap& operator=(flat_multimap&& __other) noexcept( + is_nothrow_move_assignable_v<_KeyContainer> && is_nothrow_move_assignable_v<_MappedContainer> && + is_nothrow_move_assignable_v<_Compare>) { + auto __clear_other_guard = std::__make_scope_guard([&]() noexcept { __other.clear() /* noexcept */; }); + auto __clear_self_guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + __containers_ = std::move(__other.__containers_); + __compare_ = std::move(__other.__compare_); + __clear_self_guard.__complete(); + return *this; + } + + // iterators + _LIBCPP_HIDE_FROM_ABI iterator begin() noexcept { + return iterator(__containers_.keys.begin(), __containers_.values.begin()); + } + + _LIBCPP_HIDE_FROM_ABI const_iterator begin() const noexcept { + 
return const_iterator(__containers_.keys.begin(), __containers_.values.begin()); + } + + _LIBCPP_HIDE_FROM_ABI iterator end() noexcept { + return iterator(__containers_.keys.end(), __containers_.values.end()); + } + + _LIBCPP_HIDE_FROM_ABI const_iterator end() const noexcept { + return const_iterator(__containers_.keys.end(), __containers_.values.end()); + } + + _LIBCPP_HIDE_FROM_ABI reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI reverse_iterator rend() noexcept { return reverse_iterator(begin()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } + + _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const noexcept { return begin(); } + _LIBCPP_HIDE_FROM_ABI const_iterator cend() const noexcept { return end(); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } + _LIBCPP_HIDE_FROM_ABI const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } + + // [flat.map.capacity], capacity + [[nodiscard]] _LIBCPP_HIDE_FROM_ABI bool empty() const noexcept { return __containers_.keys.empty(); } + + _LIBCPP_HIDE_FROM_ABI size_type size() const noexcept { return __containers_.keys.size(); } + + _LIBCPP_HIDE_FROM_ABI size_type max_size() const noexcept { + return std::min(__containers_.keys.max_size(), __containers_.values.max_size()); + } + + // [flat.map.modifiers], modifiers + template + requires is_constructible_v, _Args...> && is_move_constructible_v && + is_move_constructible_v + _LIBCPP_HIDE_FROM_ABI iterator emplace(_Args&&... __args) { + std::pair __pair(std::forward<_Args>(__args)...); + auto __key_it = ranges::upper_bound(__containers_.keys, __pair.first, __compare_); + auto __mapped_it = __corresponding_mapped_it(*this, __key_it); + + return __flat_map_utils::__emplace_exact_pos( + *this, std::move(__key_it), std::move(__mapped_it), std::move(__pair.first), std::move(__pair.second)); + } + + template + requires is_constructible_v, _Args...> + _LIBCPP_HIDE_FROM_ABI iterator emplace_hint(const_iterator __hint, _Args&&... __args) { + std::pair __pair(std::forward<_Args>(__args)...); + + auto __prev_larger = __hint != cbegin() && __compare_(__pair.first, (__hint - 1)->first); + auto __next_smaller = __hint != cend() && __compare_(__hint->first, __pair.first); + + auto __hint_distance = __hint.__key_iter_ - __containers_.keys.cbegin(); + auto __key_iter = __containers_.keys.begin() + __hint_distance; + auto __mapped_iter = __containers_.values.begin() + __hint_distance; + + if (!__prev_larger && !__next_smaller) [[likely]] { + // hint correct, just use exact hint iterators + } else if (__prev_larger && !__next_smaller) { + // the hint position is more to the right than the key should have been. + // we want to emplace the element to a position as right as possible + // e.g. Insert new element "2" in the following range + // 1, 1, 2, 2, 2, 3, 4, 6 + // ^ + // | + // hint + // We want to insert "2" after the last existing "2" + __key_iter = ranges::upper_bound(__containers_.keys.begin(), __key_iter, __pair.first, __compare_); + __mapped_iter = __corresponding_mapped_it(*this, __key_iter); + } else { + _LIBCPP_ASSERT_INTERNAL(!__prev_larger && __next_smaller, "this means that the multimap is not sorted"); + + // the hint position is more to the left than the key should have been. 
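To make the two hint diagrams in this comment block concrete, here is a small caller-side sketch of the resulting behavior (my illustration, not one of the patch's tests; the relative order among equal keys follows from the upper_bound/lower_bound fallbacks implemented here):

    #include <flat_map>

    int main() {
      std::flat_multimap<int, char> m{{1, 'a'}, {2, 'b'}, {2, 'c'}, {3, 'd'}};
      m.emplace_hint(m.end(), 2, 'e');   // hint right of the 2s: lands after the last 2
      m.emplace_hint(m.begin(), 2, 'f'); // hint left of the 2s: lands before the first 2
      // Keys are now 1, 2, 2, 2, 2, 3; the values mapped to key 2 read f, b, c, e.
      return 0;
    }
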
+ // we want to emplace the element to a position as left as possible + // 1, 1, 2, 2, 2, 3, 4, 6 + // ^ + // | + // hint + // We want to insert "2" before the first existing "2" + __key_iter = ranges::lower_bound(__key_iter, __containers_.keys.end(), __pair.first, __compare_); + __mapped_iter = __corresponding_mapped_it(*this, __key_iter); + } + return __flat_map_utils::__emplace_exact_pos( + *this, __key_iter, __mapped_iter, std::move(__pair.first), std::move(__pair.second)); + } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const value_type& __x) { return emplace(__x); } + + _LIBCPP_HIDE_FROM_ABI iterator insert(value_type&& __x) { return emplace(std::move(__x)); } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, const value_type& __x) { + return emplace_hint(__hint, __x); + } + + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, value_type&& __x) { + return emplace_hint(__hint, std::move(__x)); + } + + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI iterator insert(_PairLike&& __x) { + return emplace(std::forward<_PairLike>(__x)); + } + + template + requires is_constructible_v, _PairLike> + _LIBCPP_HIDE_FROM_ABI iterator insert(const_iterator __hint, _PairLike&& __x) { + return emplace_hint(__hint, std::forward<_PairLike>(__x)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI void insert(_InputIterator __first, _InputIterator __last) { + if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { + __reserve(__last - __first); + } + __append_sort_merge(std::move(__first), std::move(__last)); + } + + template + requires __has_input_iterator_category<_InputIterator>::value + _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, _InputIterator __first, _InputIterator __last) { + if constexpr (sized_sentinel_for<_InputIterator, _InputIterator>) { + __reserve(__last - __first); + } + + __append_sort_merge(std::move(__first), std::move(__last)); + } + + template <_ContainerCompatibleRange _Range> + _LIBCPP_HIDE_FROM_ABI void insert_range(_Range&& __range) { + if constexpr (ranges::sized_range<_Range>) { + __reserve(ranges::size(__range)); + } + + __append_sort_merge(ranges::begin(__range), ranges::end(__range)); + } + + _LIBCPP_HIDE_FROM_ABI void insert(initializer_list __il) { insert(__il.begin(), __il.end()); } + + _LIBCPP_HIDE_FROM_ABI void insert(sorted_equivalent_t, initializer_list __il) { + insert(sorted_equivalent, __il.begin(), __il.end()); + } + + _LIBCPP_HIDE_FROM_ABI containers extract() && { + auto __guard = std::__make_scope_guard([&]() noexcept { clear() /* noexcept */; }); + auto __ret = std::move(__containers_); + return __ret; + } + + _LIBCPP_HIDE_FROM_ABI void replace(key_container_type&& __key_cont, mapped_container_type&& __mapped_cont) { + _LIBCPP_ASSERT_VALID_INPUT_RANGE( + __key_cont.size() == __mapped_cont.size(), "flat_multimap keys and mapped containers have different size"); + + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(__is_sorted(__key_cont), "Key container is not sorted"); + auto __guard = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + __containers_.keys = std::move(__key_cont); + __containers_.values = std::move(__mapped_cont); + __guard.__complete(); + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(iterator __position) { + return __erase(__position.__key_iter_, __position.__mapped_iter_); + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __position) { + return __erase(__position.__key_iter_, 
__position.__mapped_iter_); + } + + _LIBCPP_HIDE_FROM_ABI size_type erase(const key_type& __x) { + auto [__first, __last] = equal_range(__x); + auto __res = __last - __first; + erase(__first, __last); + return __res; + } + + template + requires(__is_compare_transparent && !is_convertible_v<_Kp &&, iterator> && + !is_convertible_v<_Kp &&, const_iterator>) + _LIBCPP_HIDE_FROM_ABI size_type erase(_Kp&& __x) { + auto [__first, __last] = equal_range(__x); + auto __res = __last - __first; + erase(__first, __last); + return __res; + } + + _LIBCPP_HIDE_FROM_ABI iterator erase(const_iterator __first, const_iterator __last) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + auto __key_it = __containers_.keys.erase(__first.__key_iter_, __last.__key_iter_); + auto __mapped_it = __containers_.values.erase(__first.__mapped_iter_, __last.__mapped_iter_); + __on_failure.__complete(); + return iterator(std::move(__key_it), std::move(__mapped_it)); + } + + _LIBCPP_HIDE_FROM_ABI void swap(flat_multimap& __y) noexcept { + // warning: The spec has unconditional noexcept, which means that + // if any of the following functions throw an exception, + // std::terminate will be called + ranges::swap(__compare_, __y.__compare_); + ranges::swap(__containers_.keys, __y.__containers_.keys); + ranges::swap(__containers_.values, __y.__containers_.values); + } + + _LIBCPP_HIDE_FROM_ABI void clear() noexcept { + __containers_.keys.clear(); + __containers_.values.clear(); + } + + // observers + _LIBCPP_HIDE_FROM_ABI key_compare key_comp() const { return __compare_; } + _LIBCPP_HIDE_FROM_ABI value_compare value_comp() const { return value_compare(__compare_); } + + _LIBCPP_HIDE_FROM_ABI const key_container_type& keys() const noexcept { return __containers_.keys; } + _LIBCPP_HIDE_FROM_ABI const mapped_container_type& values() const noexcept { return __containers_.values; } + + // map operations + _LIBCPP_HIDE_FROM_ABI iterator find(const key_type& __x) { return __find_impl(*this, __x); } + + _LIBCPP_HIDE_FROM_ABI const_iterator find(const key_type& __x) const { return __find_impl(*this, __x); } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI iterator find(const _Kp& __x) { + return __find_impl(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI const_iterator find(const _Kp& __x) const { + return __find_impl(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI size_type count(const key_type& __x) const { + auto [__first, __last] = equal_range(__x); + return __last - __first; + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI size_type count(const _Kp& __x) const { + auto [__first, __last] = equal_range(__x); + return __last - __first; + } + + _LIBCPP_HIDE_FROM_ABI bool contains(const key_type& __x) const { return find(__x) != end(); } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI bool contains(const _Kp& __x) const { + return find(__x) != end(); + } + + _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const key_type& __x) { return __lower_bound(*this, __x); } + + _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const key_type& __x) const { + return __lower_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI iterator lower_bound(const _Kp& __x) { + return __lower_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI const_iterator lower_bound(const _Kp& __x) const { + return __lower_bound(*this, 
__x); + } + + _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const key_type& __x) { return __upper_bound(*this, __x); } + + _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const key_type& __x) const { + return __upper_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI iterator upper_bound(const _Kp& __x) { + return __upper_bound(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI const_iterator upper_bound(const _Kp& __x) const { + return __upper_bound(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) { + return __equal_range_impl(*this, __x); + } + + _LIBCPP_HIDE_FROM_ABI pair equal_range(const key_type& __x) const { + return __equal_range_impl(*this, __x); + } + + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) { + return __equal_range_impl(*this, __x); + } + template + requires __is_compare_transparent + _LIBCPP_HIDE_FROM_ABI pair equal_range(const _Kp& __x) const { + return __equal_range_impl(*this, __x); + } + + friend _LIBCPP_HIDE_FROM_ABI bool operator==(const flat_multimap& __x, const flat_multimap& __y) { + return ranges::equal(__x, __y); + } + + friend _LIBCPP_HIDE_FROM_ABI auto operator<=>(const flat_multimap& __x, const flat_multimap& __y) { + return std::lexicographical_compare_three_way( + __x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); + } + + friend _LIBCPP_HIDE_FROM_ABI void swap(flat_multimap& __x, flat_multimap& __y) noexcept { __x.swap(__y); } + +private: + struct __ctor_uses_allocator_tag { + explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_tag() = default; + }; + struct __ctor_uses_allocator_empty_tag { + explicit _LIBCPP_HIDE_FROM_ABI __ctor_uses_allocator_empty_tag() = default; + }; + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI + flat_multimap(__ctor_uses_allocator_tag, + const _Allocator& __alloc, + _KeyCont&& __key_cont, + _MappedCont&& __mapped_cont, + _CompArg&&... __comp) + : __containers_{.keys = std::make_obj_using_allocator( + __alloc, std::forward<_KeyCont>(__key_cont)), + .values = std::make_obj_using_allocator( + __alloc, std::forward<_MappedCont>(__mapped_cont))}, + __compare_(std::forward<_CompArg>(__comp)...) {} + + template + requires __allocator_ctor_constraint<_Allocator> + _LIBCPP_HIDE_FROM_ABI flat_multimap(__ctor_uses_allocator_empty_tag, const _Allocator& __alloc, _CompArg&&... __comp) + : __containers_{.keys = std::make_obj_using_allocator(__alloc), + .values = std::make_obj_using_allocator(__alloc)}, + __compare_(std::forward<_CompArg>(__comp)...) 
{} + + _LIBCPP_HIDE_FROM_ABI bool __is_sorted(auto&& __key_container) const { + return ranges::is_sorted(__key_container, __compare_); + } + + _LIBCPP_HIDE_FROM_ABI void __sort() { + auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); + ranges::sort(__zv, __compare_, [](const auto& __p) -> decltype(auto) { return std::get<0>(__p); }); + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __corresponding_mapped_it(_Self&& __self, _KeyIter&& __key_iter) { + return __self.__containers_.values.begin() + + static_cast>( + ranges::distance(__self.__containers_.keys.begin(), __key_iter)); + } + + template + _LIBCPP_HIDE_FROM_ABI void __append_sort_merge(_InputIterator __first, _Sentinel __last) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + size_t __num_appended = __flat_map_utils::__append(*this, std::move(__first), std::move(__last)); + if (__num_appended != 0) { + auto __zv = ranges::views::zip(__containers_.keys, __containers_.values); + auto __append_start_offset = __containers_.keys.size() - __num_appended; + auto __end = __zv.end(); + auto __compare_key = [this](const auto& __p1, const auto& __p2) { + return __compare_(std::get<0>(__p1), std::get<0>(__p2)); + }; + if constexpr (!_WasSorted) { + ranges::sort(__zv.begin() + __append_start_offset, __end, __compare_key); + } else { + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( + __is_sorted(__containers_.keys | ranges::views::drop(__append_start_offset)), + "Key container is not sorted"); + } + ranges::inplace_merge(__zv.begin(), __zv.begin() + __append_start_offset, __end, __compare_key); + } + __on_failure.__complete(); + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __find_impl(_Self&& __self, const _Kp& __key) { + auto __it = __self.lower_bound(__key); + auto __last = __self.end(); + if (__it == __last || __self.__compare_(__key, __it->first)) { + return __last; + } + return __it; + } + + template + _LIBCPP_HIDE_FROM_ABI static auto __equal_range_impl(_Self&& __self, const _Kp& __key) { + auto [__key_first, __key_last] = ranges::equal_range(__self.__containers_.keys, __key, __self.__compare_); + + using __iterator_type = ranges::iterator_t; + return std::make_pair(__iterator_type(__key_first, __corresponding_mapped_it(__self, __key_first)), + __iterator_type(__key_last, __corresponding_mapped_it(__self, __key_last))); + } + + template + _LIBCPP_HIDE_FROM_ABI static _Res __lower_bound(_Self&& __self, _Kp& __x) { + auto __key_iter = ranges::lower_bound(__self.__containers_.keys, __x, __self.__compare_); + auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); + return _Res(std::move(__key_iter), std::move(__mapped_iter)); + } + + template + _LIBCPP_HIDE_FROM_ABI static _Res __upper_bound(_Self&& __self, _Kp& __x) { + auto __key_iter = ranges::upper_bound(__self.__containers_.keys, __x, __self.__compare_); + auto __mapped_iter = __corresponding_mapped_it(__self, __key_iter); + return _Res(std::move(__key_iter), std::move(__mapped_iter)); + } + + _LIBCPP_HIDE_FROM_ABI void __reserve(size_t __size) { + if constexpr (requires { __containers_.keys.reserve(__size); }) { + __containers_.keys.reserve(__size); + } + + if constexpr (requires { __containers_.values.reserve(__size); }) { + __containers_.values.reserve(__size); + } + } + + template + _LIBCPP_HIDE_FROM_ABI iterator __erase(_KIter __key_iter_to_remove, _MIter __mapped_iter_to_remove) { + auto __on_failure = std::__make_exception_guard([&]() noexcept { clear() /* noexcept */; }); + auto __key_iter = 
__containers_.keys.erase(__key_iter_to_remove); + auto __mapped_iter = __containers_.values.erase(__mapped_iter_to_remove); + __on_failure.__complete(); + return iterator(std::move(__key_iter), std::move(__mapped_iter)); + } + + template + friend typename flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>::size_type + erase_if(flat_multimap<_Key2, _Tp2, _Compare2, _KeyContainer2, _MappedContainer2>&, _Predicate); + + friend __flat_map_utils; + + containers __containers_; + _LIBCPP_NO_UNIQUE_ADDRESS key_compare __compare_; + + struct __key_equiv { + _LIBCPP_HIDE_FROM_ABI __key_equiv(key_compare __c) : __comp_(__c) {} + _LIBCPP_HIDE_FROM_ABI bool operator()(const_reference __x, const_reference __y) const { + return !__comp_(std::get<0>(__x), std::get<0>(__y)) && !__comp_(std::get<0>(__y), std::get<0>(__x)); + } + key_compare __comp_; + }; +}; + +template > + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && + is_invocable_v) +flat_multimap(_KeyContainer, _MappedContainer, _Compare = _Compare()) + -> flat_multimap; + +template + requires(uses_allocator_v<_KeyContainer, _Allocator> && uses_allocator_v<_MappedContainer, _Allocator> && + !__is_allocator<_KeyContainer>::value && !__is_allocator<_MappedContainer>::value) +flat_multimap(_KeyContainer, _MappedContainer, _Allocator) + -> flat_multimap, + _KeyContainer, + _MappedContainer>; + +template + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && uses_allocator_v<_KeyContainer, _Allocator> && + uses_allocator_v<_MappedContainer, _Allocator> && + is_invocable_v) +flat_multimap(_KeyContainer, _MappedContainer, _Compare, _Allocator) + -> flat_multimap; + +template > + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && + is_invocable_v) +flat_multimap(sorted_equivalent_t, _KeyContainer, _MappedContainer, _Compare = _Compare()) + -> flat_multimap; + +template + requires(uses_allocator_v<_KeyContainer, _Allocator> && uses_allocator_v<_MappedContainer, _Allocator> && + !__is_allocator<_KeyContainer>::value && !__is_allocator<_MappedContainer>::value) +flat_multimap(sorted_equivalent_t, _KeyContainer, _MappedContainer, _Allocator) + -> flat_multimap, + _KeyContainer, + _MappedContainer>; + +template + requires(!__is_allocator<_Compare>::value && !__is_allocator<_KeyContainer>::value && + !__is_allocator<_MappedContainer>::value && uses_allocator_v<_KeyContainer, _Allocator> && + uses_allocator_v<_MappedContainer, _Allocator> && + is_invocable_v) +flat_multimap(sorted_equivalent_t, _KeyContainer, _MappedContainer, _Compare, _Allocator) + -> flat_multimap; + +template >> + requires(__has_input_iterator_category<_InputIterator>::value && !__is_allocator<_Compare>::value) +flat_multimap(_InputIterator, _InputIterator, _Compare = _Compare()) + -> flat_multimap<__iter_key_type<_InputIterator>, __iter_mapped_type<_InputIterator>, _Compare>; + +template >> + requires(__has_input_iterator_category<_InputIterator>::value && !__is_allocator<_Compare>::value) +flat_multimap(sorted_equivalent_t, _InputIterator, _InputIterator, _Compare = _Compare()) + -> flat_multimap<__iter_key_type<_InputIterator>, __iter_mapped_type<_InputIterator>, _Compare>; + +template >, + class _Allocator = allocator, + class = __enable_if_t::value && __is_allocator<_Allocator>::value>> +flat_multimap(from_range_t, _Range&&, 
_Compare = _Compare(), _Allocator = _Allocator()) -> flat_multimap< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + _Compare, + vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; + +template ::value>> +flat_multimap(from_range_t, _Range&&, _Allocator) -> flat_multimap< + __range_key_type<_Range>, + __range_mapped_type<_Range>, + less<__range_key_type<_Range>>, + vector<__range_key_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_key_type<_Range>>>, + vector<__range_mapped_type<_Range>, __allocator_traits_rebind_t<_Allocator, __range_mapped_type<_Range>>>>; + +template > + requires(!__is_allocator<_Compare>::value) +flat_multimap(initializer_list>, _Compare = _Compare()) -> flat_multimap<_Key, _Tp, _Compare>; + +template > + requires(!__is_allocator<_Compare>::value) +flat_multimap(sorted_equivalent_t, initializer_list>, _Compare = _Compare()) + -> flat_multimap<_Key, _Tp, _Compare>; + +template +struct uses_allocator, _Allocator> + : bool_constant && uses_allocator_v<_MappedContainer, _Allocator>> {}; + +template +_LIBCPP_HIDE_FROM_ABI typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::size_type +erase_if(flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>& __flat_multimap, _Predicate __pred) { + auto __zv = ranges::views::zip(__flat_multimap.__containers_.keys, __flat_multimap.__containers_.values); + auto __first = __zv.begin(); + auto __last = __zv.end(); + auto __guard = std::__make_exception_guard([&] { __flat_multimap.clear(); }); + auto __it = std::remove_if(__first, __last, [&](auto&& __zipped) -> bool { + using _Ref = typename flat_multimap<_Key, _Tp, _Compare, _KeyContainer, _MappedContainer>::const_reference; + return __pred(_Ref(std::get<0>(__zipped), std::get<1>(__zipped))); + }); + auto __res = __last - __it; + auto __offset = __it - __first; + + const auto __erase_container = [&](auto& __cont) { __cont.erase(__cont.begin() + __offset, __cont.end()); }; + + __erase_container(__flat_multimap.__containers_.keys); + __erase_container(__flat_multimap.__containers_.values); + + __guard.__complete(); + return __res; +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_STD_VER >= 23 + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___FLAT_MAP_FLAT_MULTIMAP_H diff --git a/libcxx/include/__flat_map/sorted_equivalent.h b/libcxx/include/__flat_map/sorted_equivalent.h new file mode 100644 index 0000000000000..1db935cc6ee75 --- /dev/null +++ b/libcxx/include/__flat_map/sorted_equivalent.h @@ -0,0 +1,31 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef _LIBCPP___FLAT_MAP_SORTED_EQUIVALENT_H
+#define _LIBCPP___FLAT_MAP_SORTED_EQUIVALENT_H
+
+#include <__config>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 23
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct sorted_equivalent_t {
+  explicit sorted_equivalent_t() = default;
+};
+inline constexpr sorted_equivalent_t sorted_equivalent{};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+#endif // _LIBCPP___FLAT_MAP_SORTED_EQUIVALENT_H
diff --git a/libcxx/include/__flat_map/utils.h b/libcxx/include/__flat_map/utils.h
new file mode 100644
index 0000000000000..acb7dca7ffe96
--- /dev/null
+++ b/libcxx/include/__flat_map/utils.h
@@ -0,0 +1,103 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___FLAT_MAP_UTILS_H
+#define _LIBCPP___FLAT_MAP_UTILS_H
+
+#include <__config>
+#include <__type_traits/container_traits.h>
+#include <__utility/exception_guard.h>
+#include <__utility/forward.h>
+#include <__utility/move.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if _LIBCPP_STD_VER >= 23
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// These utilities are defined in a class instead of a namespace so that this class can be befriended more easily.
+struct __flat_map_utils {
+  // Emplace a {key: value} pair into a flat_{multi}map, at the exact position that
+  // __it_key and __it_mapped point to, assuming that the key is not already present in the map.
+  // When an exception is thrown during the emplacement, the function tries its best to
+  // roll back the changes it made to the map. If it cannot roll back the changes, it
+  // clears the map.
+  template <class _Map, class _IterK, class _IterM, class _KeyArg, class... _MArgs>
+  _LIBCPP_HIDE_FROM_ABI static typename _Map::iterator __emplace_exact_pos(
+      _Map& __map, _IterK&& __it_key, _IterM&& __it_mapped, _KeyArg&& __key, _MArgs&&... __mapped_args) {
+    auto __on_key_failed = std::__make_exception_guard([&]() noexcept {
+      using _KeyContainer = typename _Map::key_container_type;
+      if constexpr (__container_traits<_KeyContainer>::__emplacement_has_strong_exception_safety_guarantee) {
+        // Nothing to roll back!
+      } else {
+        // We need to clear both because we don't know the state of our keys anymore.
+        __map.clear() /* noexcept */;
+      }
+    });
+    auto __key_it = __map.__containers_.keys.emplace(__it_key, std::forward<_KeyArg>(__key));
+    __on_key_failed.__complete();
+
+    auto __on_value_failed = std::__make_exception_guard([&]() noexcept {
+      using _MappedContainer = typename _Map::mapped_container_type;
+      if constexpr (!__container_traits<_MappedContainer>::__emplacement_has_strong_exception_safety_guarantee) {
+        // We need to clear both because we don't know the state of our values anymore.
+        __map.clear() /* noexcept */;
+      } else {
+        // In this case, we know the values are just like before we attempted emplacement,
+        // and we also know that the keys have been emplaced successfully. Just roll back the keys.
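+        // (Erasing the freshly inserted key below may itself throw; if that
+        // happens, the catch handler that follows falls back to clear() too.)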
+# if _LIBCPP_HAS_EXCEPTIONS
+        try {
+# endif // _LIBCPP_HAS_EXCEPTIONS
+          __map.__containers_.keys.erase(__key_it);
+# if _LIBCPP_HAS_EXCEPTIONS
+        } catch (...) {
+          // Now things are funky for real. We're failing to roll back the keys.
+          // Just give up and clear the whole thing.
+          //
+          // Also, swallow the exception that happened during the rollback and let the
+          // original value-emplacement exception propagate normally.
+          __map.clear() /* noexcept */;
+        }
+# endif // _LIBCPP_HAS_EXCEPTIONS
+      }
+    });
+    auto __mapped_it = __map.__containers_.values.emplace(__it_mapped, std::forward<_MArgs>(__mapped_args)...);
+    __on_value_failed.__complete();
+
+    return typename _Map::iterator(std::move(__key_it), std::move(__mapped_it));
+  }
+
+  // TODO: We could optimize this, see
+  // https://github.com/llvm/llvm-project/issues/108624
+  template <class _Map, class _InputIterator, class _Sentinel>
+  _LIBCPP_HIDE_FROM_ABI static typename _Map::size_type
+  __append(_Map& __map, _InputIterator __first, _Sentinel __last) {
+    typename _Map::size_type __num_appended = 0;
+    for (; __first != __last; ++__first) {
+      typename _Map::value_type __kv = *__first;
+      __map.__containers_.keys.insert(__map.__containers_.keys.end(), std::move(__kv.first));
+      __map.__containers_.values.insert(__map.__containers_.values.end(), std::move(__kv.second));
+      ++__num_appended;
+    }
+    return __num_appended;
+  }
+};
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___FLAT_MAP_UTILS_H
diff --git a/libcxx/include/__functional/is_transparent.h b/libcxx/include/__functional/is_transparent.h
index b2d62f2e3ead8..567df1a662f54 100644
--- a/libcxx/include/__functional/is_transparent.h
+++ b/libcxx/include/__functional/is_transparent.h
@@ -21,11 +21,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD

 #if _LIBCPP_STD_VER >= 14

-template <class _Tp, class _Up, class = void>
+template <class _Tp, class _Key, class = void>
 inline const bool __is_transparent_v = false;

-template <class _Tp, class _Up>
-inline const bool __is_transparent_v<_Tp, _Up, __void_t<typename _Tp::is_transparent> > = true;
+template <class _Tp, class _Key>
+inline const bool __is_transparent_v<_Tp, _Key, __void_t<typename _Tp::is_transparent> > = true;

 #endif
diff --git a/libcxx/include/flat_map b/libcxx/include/flat_map
index dbe5d8ee8f8c3..2552450081734 100644
--- a/libcxx/include/flat_map
+++ b/libcxx/include/flat_map
@@ -35,6 +35,25 @@ namespace std {
            class Predicate>
     typename flat_map<Key, T, Compare, KeyContainer, MappedContainer>::size_type
     erase_if(flat_map<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
+
+  // [flat.multimap], class template flat_multimap
+  template<class Key, class T, class Compare = less<Key>,
+           class KeyContainer = vector<Key>, class MappedContainer = vector<T>>
+  class flat_multimap;
+
+  struct sorted_equivalent_t { explicit sorted_equivalent_t() = default; };
+  inline constexpr sorted_equivalent_t sorted_equivalent{};
+
+  template<class Key, class T, class Compare, class KeyContainer, class MappedContainer,
+           class Allocator>
+  struct uses_allocator<flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>,
+                        Allocator>;
+
+  // [flat.multimap.erasure], erasure for flat_multimap
+  template<class Key, class T, class Compare, class KeyContainer, class MappedContainer,
+           class Predicate>
+  typename flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>::size_type
+  erase_if(flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
 */

 #if __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS)
@@ -44,6 +63,8 @@ namespace std {

 # if _LIBCPP_STD_VER >= 23
 #  include <__flat_map/flat_map.h>
+#  include <__flat_map/flat_multimap.h>
+#  include <__flat_map/sorted_equivalent.h>
 #  include <__flat_map/sorted_unique.h>
 # endif

diff --git a/libcxx/include/module.modulemap b/libcxx/include/module.modulemap
index 6c2fb8dc3940b..4bae02137b37b 100644
--- a/libcxx/include/module.modulemap
+++ b/libcxx/include/module.modulemap
@@ -1244,9 +1244,20 @@ module std [system] {
   }

   module flat_map {
-    module flat_map { header "__flat_map/flat_map.h" }
+    module flat_map {
+      header "__flat_map/flat_map.h"
+      export std.vector.vector
+      export std.vector.fwd
+    }
+    module flat_multimap {
+
header "__flat_map/flat_multimap.h" + export std.vector.vector + export std.vector.fwd + } module key_value_iterator { header "__flat_map/key_value_iterator.h" } + module sorted_equivalent { header "__flat_map/sorted_equivalent.h" } module sorted_unique { header "__flat_map/sorted_unique.h" } + module utils { header "__flat_map/utils.h" } header "flat_map" export * diff --git a/libcxx/include/version b/libcxx/include/version index 57d6ec629d27c..29a71ed574e56 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -101,6 +101,8 @@ __cpp_lib_execution 201902L 201603L // C++17 __cpp_lib_expected 202211L __cpp_lib_filesystem 201703L +__cpp_lib_flat_map 202207L +__cpp_lib_flat_set 202207L __cpp_lib_format 202110L __cpp_lib_format_path 202403L __cpp_lib_format_ranges 202207L @@ -480,6 +482,8 @@ __cpp_lib_void_t 201411L # define __cpp_lib_constexpr_typeinfo 202106L # define __cpp_lib_containers_ranges 202202L # define __cpp_lib_expected 202211L +# define __cpp_lib_flat_map 202207L +// # define __cpp_lib_flat_set 202207L # define __cpp_lib_format_ranges 202207L // # define __cpp_lib_formatters 202302L # define __cpp_lib_forward_like 202207L diff --git a/libcxx/modules/std/flat_map.inc b/libcxx/modules/std/flat_map.inc index 6a86229bceaba..e9521749dc4a8 100644 --- a/libcxx/modules/std/flat_map.inc +++ b/libcxx/modules/std/flat_map.inc @@ -20,8 +20,6 @@ export namespace std { // [flat.map.erasure], erasure for flat_map using std::erase_if; -#endif // _LIBCPP_STD_VER >= 23 -#if 0 // [flat.multimap], class template flat_multimap using std::flat_multimap; @@ -29,5 +27,5 @@ export namespace std { using std::sorted_equivalent_t; // [flat.multimap.erasure], erasure for flat_multimap -#endif +#endif // _LIBCPP_STD_VER >= 23 } // namespace std diff --git a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.input_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.input_range.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.input_range.pass.cpp rename to libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.input_range.pass.cpp diff --git a/libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.sorted_unique.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.sorted_unique.pass.cpp similarity index 100% rename from libcxx/test/libcxx/containers/containers.adaptors/flat.map/assert.sorted_unique.pass.cpp rename to libcxx/test/libcxx/containers/container.adaptors/flat.map/assert.sorted_unique.pass.cpp diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp new file mode 100644 index 0000000000000..504f36fcd00b8 --- /dev/null +++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.input_range.pass.cpp @@ -0,0 +1,66 @@ +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: libcpp-hardening-mode=none
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// <flat_map>
+
+// flat_multimap(key_container_type , mapped_container_type , const key_compare& __comp = key_compare())
+// flat_multimap(const key_container_type& , const mapped_container_type& , const _Allocator& )
+// flat_multimap(const key_container_type& , const mapped_container_type& , const key_compare&, const _Allocator& )
+// void replace(key_container_type&& , mapped_container_type&&)
+//
+
+#include <flat_map>
+#include <functional>
+#include <memory>
+#include <vector>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  using M = std::flat_multimap<int, int>;
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] { M m({1, 2, 3}, {4}); }()), "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(([] { M m({1, 2, 3}, {4}, std::less<int>{}); }()),
+                             "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> keys{1, 2, 3};
+        const std::vector<int> values{4};
+        const std::allocator<int> alloc{};
+        M m(keys, values, alloc);
+      }()),
+      "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        const std::vector<int> keys{1, 2, 3};
+        const std::vector<int> values{4};
+        const std::less<int> key_compare{};
+        const std::allocator<int> alloc{};
+        M m(keys, values, key_compare, alloc);
+      }()),
+      "flat_multimap keys and mapped containers have different size");
+
+  TEST_LIBCPP_ASSERT_FAILURE(
+      ([] {
+        std::vector<int> keys{1, 2, 3};
+        std::vector<int> values{4};
+        M m;
+        m.replace(std::move(keys), std::move(values));
+      }()),
+      "flat_multimap keys and mapped containers have different size");
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp
new file mode 100644
index 0000000000000..6b8ad3c7ac9aa
--- /dev/null
+++ b/libcxx/test/libcxx/containers/container.adaptors/flat.multimap/assert.sorted_equivalent.pass.cpp
@@ -0,0 +1,225 @@
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: has-unix-headers +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: libcpp-hardening-mode=none +// REQUIRES: libcpp-hardening-mode=debug +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing + +// + +// flat_multimap(key_container_type , mapped_container_type , const key_compare& __comp = key_compare()) +// flat_multimap(const key_container_type& , const mapped_container_type& , const _Allocator& ) +// flat_multimap(const key_container_type& , const mapped_container_type& , const key_compare&, const _Allocator& ) +// void replace(key_container_type&& , mapped_container_type&&) +// + +#include +#include +#include +#include +#include +#include + +#include "check_assertion.h" + +int main(int, char**) { + using M = std::flat_multimap; + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {2, 2, 1}, {4, 5, 6}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {4, 2, 3}, {4, 5, 6}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {2, 2, 1}, {4, 5, 6}, std::less{}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { M m(std::sorted_equivalent, {4, 2, 3}, {4, 5, 6}, std::less{}); }()), "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{2, 2, 1}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, keys, values, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{4, 2, 3}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, keys, values, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{2, 2, 1}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + const std::less comp{}; + M m(std::sorted_equivalent, keys, values, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector keys{4, 2, 3}; + const std::vector values{4, 5, 6}; + const std::allocator alloc{}; + const std::less comp{}; + M m(std::sorted_equivalent, keys, values, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v.begin(), v.end(), comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + const std::allocator alloc{}; + M 
m(std::sorted_equivalent, v.begin(), v.end(), alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v.begin(), v.end(), alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v, comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + M m(std::sorted_equivalent, v, comp); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + const std::less comp{}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, comp, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + const std::allocator alloc{}; + M m(std::sorted_equivalent, v, alloc); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{2, 4}, {2, 5}, {1, 6}}; + M m; + m.insert(std::sorted_equivalent, v.begin(), v.end()); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + const std::vector> v{{4, 4}, {2, 5}, {3, 6}}; + M m; + m.insert(std::sorted_equivalent, v.begin(), v.end()); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{2, 4}, {2, 5}, {1, 6}}; + M m; + m.insert(std::sorted_equivalent, v); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::initializer_list> v{{4, 4}, {2, 5}, {3, 6}}; + M m; + m.insert(std::sorted_equivalent, v); + }()), + "Key container is not sorted"); + + TEST_LIBCPP_ASSERT_FAILURE( + ([] { + std::vector keys{2, 1, 3}; + std::vector values{4, 5, 6}; + M m; + m.replace(std::move(keys), std::move(values)); + }()), + "Key container is not sorted"); + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp new file mode 100644 index 0000000000000..d9ee3fbd287b5 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.map.syn/sorted_equivalent.pass.cpp @@ -0,0 +1,50 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// struct sorted_equivalent_t { explicit sorted_equivalent_t() = default; };
+// inline constexpr sorted_equivalent_t sorted_equivalent{};
+
+#include <concepts>
+#include <flat_map>
+#include <type_traits>
+#include <utility>
+
+template <class T>
+void implicit_test(T) {}
+
+template <class T>
+concept HasImplicitDefaultCtor = requires { implicit_test<T>({}); };
+
+static_assert(std::is_default_constructible_v<std::sorted_equivalent_t>);
+static_assert(std::is_trivially_default_constructible_v<std::sorted_equivalent_t>);
+static_assert(!HasImplicitDefaultCtor<std::sorted_equivalent_t>);
+
+constexpr bool test() {
+  {
+    [[maybe_unused]] std::sorted_equivalent_t s;
+  }
+  {
+    [[maybe_unused]] std::same_as<const std::sorted_equivalent_t&> decltype(auto) s = (std::sorted_equivalent);
+  }
+  {
+    [[maybe_unused]] std::same_as<std::sorted_equivalent_t> decltype(auto) copy = std::sorted_equivalent;
+  }
+
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
index 5ecc2cf7c917b..05efe063c1e17 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.pass.cpp
@@ -25,7 +25,9 @@ template <class KeyContainer, class ValueContainer>
 void test() {
-  using M = std::flat_map<int, char, std::less<int>, KeyContainer, ValueContainer>;
+  using Key = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  using M = std::flat_map<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
   M m;
   ASSERT_SAME_TYPE(decltype(m.empty()), bool);
   ASSERT_NOEXCEPT(m.empty());
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp
index cc8016182dcb6..79b943b790d04 100644
--- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.capacity/empty.verify.cpp
@@ -14,11 +14,7 @@

 #include <flat_map>

-#include "test_macros.h"
-
-int main(int, char**) {
+void f() {
   std::flat_map<int, int> c;
   c.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}}
-
-  return 0;
 }
diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp
new file mode 100644
index 0000000000000..190d78f927f34
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.compile.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// Test CTAD on cases where deduction should fail.
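+//
+// A deduction that is supposed to fail cannot be written out directly (it
+// would be a hard error), so the checks below wrap CTAD in a variadic
+// concept: inside a requires-expression, the ill-formed deduction merely
+// renders the concept false. A minimal sketch of the technique (hypothetical
+// name, not the helper used below):
+//
+//   template <class... Args>
+//   concept CanDeduct = requires { std::flat_map{std::declval<Args>()...}; };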
+ +#include +#include +#include +#include +#include + +struct NotAnAllocator { + friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } +}; + +using P = std::pair; +using PC = std::pair; + +template +concept CanDeductFlatMap = requires { std::flat_map{std::declval()...}; }; + +static_assert(CanDeductFlatMap, std::vector>); + +// cannot deduce Key and T from nothing +static_assert(!CanDeductFlatMap<>); + +// cannot deduce Key and T from just (KeyContainer), even if it's a container of pairs +static_assert(!CanDeductFlatMap>>); + +// cannot deduce Key and T from just (KeyContainer, Allocator) +static_assert(!CanDeductFlatMap, std::allocator>>); + +// cannot deduce Key and T from just (Compare) +static_assert(!CanDeductFlatMap>); + +// cannot deduce Key and T from just (Compare, Allocator) +static_assert(!CanDeductFlatMap, std::allocator>); + +// cannot deduce Key and T from just (Allocator) +static_assert(!CanDeductFlatMap>); + +// cannot convert from some arbitrary unrelated type +static_assert(!CanDeductFlatMap); diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp index d01bee9aae9c0..009392feb3862 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.pass.cpp @@ -169,6 +169,24 @@ void test_iter_iter() { std::flat_map m(mo.cbegin(), mo.cend()); ASSERT_SAME_TYPE(decltype(m), decltype(mo)); } + { + std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; + std::flat_map s = {source, source + 3}; // flat_map(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; + std::flat_map s{source, source + 3}; // flat_map(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; + std::flat_map s{std::sorted_unique, source, source + 3}; // flat_map(sorted_unique_t, InputIterator, InputIterator) + static_assert(std::is_same_v>); + assert(s.size() == 3); + } } void test_iter_iter_compare() { @@ -227,6 +245,19 @@ void test_initializer_list() { ASSERT_SAME_TYPE(decltype(m), std::flat_map); assert(std::ranges::equal(m, sorted_arr)); } + { + std::flat_map s = {std::make_pair(1, 'a')}; // flat_map(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 1); + } + { + using M = std::flat_map; + M m; + std::flat_map s = {std::make_pair(m, m)}; // flat_map(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_map); + assert(s.size() == 1); + assert(s[m] == m); + } } void test_initializer_list_compare() { @@ -305,38 +336,6 @@ int main(int, char**) { test_from_range_compare(); AssociativeContainerDeductionGuidesSfinaeAway>(); - { - std::flat_map s = {std::make_pair(1, 'a')}; // flat_map(initializer_list>) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 1); - } - { - using M = std::flat_map; - M m; - std::flat_map s = {std::make_pair(m, m)}; // flat_map(initializer_list>) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 1); - assert(s[m] == m); - } - - { - std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; - std::flat_map s = {source, source + 3}; // flat_map(InputIterator, InputIterator) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 3); - } - { - 
std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; - std::flat_map s{source, source + 3}; // flat_map(InputIterator, InputIterator) - ASSERT_SAME_TYPE(decltype(s), std::flat_map); - assert(s.size() == 3); - } - { - std::pair source[3] = {{1, 1}, {2, 2}, {3, 3}}; - std::flat_map s{std::sorted_unique, source, source + 3}; // flat_map(sorted_unique_t, InputIterator, InputIterator) - static_assert(std::is_same_v>); - assert(s.size() == 3); - } return 0; } diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp index 08244f01cb24e..ed20c1ae715b8 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/deduct.verify.cpp @@ -14,56 +14,12 @@ #include #include -#include #include -#include - -struct NotAnAllocator { - friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } -}; using P = std::pair; using PC = std::pair; void test() { - { - // cannot deduce Key and T from just (KeyContainer), even if it's a container of pairs - std::vector> v; - std::flat_map s(v); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (KeyContainer, Allocator) - std::vector v; - std::flat_map s(v, std::allocator>()); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from nothing - std::flat_map m; - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (Compare) - std::flat_map m(std::less{}); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (Compare, Allocator) - std::flat_map m(std::less{}, std::allocator{}); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot deduce Key and T from just (Allocator) - std::flat_map m(std::allocator{}); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } - { - // cannot convert from some arbitrary unrelated type - NotAnAllocator a; - std::flat_map m(a); - // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_map'}}}} - } { // cannot deduce that the inner braced things should be std::pair and not something else std::flat_map m{{1, 1L}, {2, 2L}, {3, 3L}}; diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp index ac24c8a8ac067..790dfa4a02ed5 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/default_noexcept.pass.cpp @@ -37,10 +37,12 @@ int main(int, char**) { { using C = std::flat_map; static_assert(std::is_nothrow_default_constructible_v); + C c; } { using C = std::flat_map, std::vector>>; static_assert(std::is_nothrow_default_constructible_v); + C c; } 
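+  // Instantiating C above (rather than only inspecting the trait) ensures the
+  // nothrow default constructor is actually compiled and executed.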
#endif // _LIBCPP_VERSION { diff --git a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp index e3ab33a55d95b..1570b0fa14888 100644 --- a/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp +++ b/libcxx/test/std/containers/container.adaptors/flat.map/flat.map.cons/dtor_noexcept.pass.cpp @@ -24,28 +24,32 @@ struct ThrowingDtorComp { bool operator()(const auto&, const auto&) const; - ~ThrowingDtorComp() noexcept(false); + ~ThrowingDtorComp() noexcept(false) {} }; int main(int, char**) { { using C = std::flat_map; static_assert(std::is_nothrow_destructible_v); + C c; } { using V = std::vector>; using C = std::flat_map, V, V>; static_assert(std::is_nothrow_destructible_v); + C c; } { using V = std::deque>; using C = std::flat_map, V, V>; static_assert(std::is_nothrow_destructible_v); + C c; } #if defined(_LIBCPP_VERSION) { using C = std::flat_map; static_assert(!std::is_nothrow_destructible_v); + C c; } #endif // _LIBCPP_VERSION diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp new file mode 100644 index 0000000000000..4fa4fd6a69b94 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.pass.cpp @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// [[nodiscard]] bool empty() const noexcept; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m; + ASSERT_SAME_TYPE(decltype(m.empty()), bool); + ASSERT_NOEXCEPT(m.empty()); + assert(m.empty()); + assert(std::as_const(m).empty()); + m = {{1, 1.0}, {1, 2.0}}; + assert(!m.empty()); + m.clear(); + assert(m.empty()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp new file mode 100644 index 0000000000000..9b7b827c9bec8 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/empty.verify.cpp @@ -0,0 +1,22 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// [[nodiscard]] bool empty() const noexcept; + +#include + +void f() { + std::flat_multimap c; + c.empty(); // expected-warning {{ignoring return value of function declared with 'nodiscard' attribute}} +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp new file mode 100644 index 0000000000000..0960c43c5a90a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/max_size.pass.cpp @@ -0,0 +1,78 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type max_size() const noexcept; + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_allocator.h" +#include "test_macros.h" + +int main(int, char**) { + { + using A1 = limited_allocator; + using A2 = limited_allocator; + using C = std::flat_multimap, std::vector, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= 10); + LIBCPP_ASSERT(c.max_size() == 10); + } + { + using A1 = limited_allocator; + using A2 = limited_allocator; + using C = std::flat_multimap, std::vector, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= 10); + LIBCPP_ASSERT(c.max_size() == 10); + } + { + using A = limited_allocator; + using C = std::flat_multimap, std::vector, std::vector>; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C::size_type max_dist = static_cast(std::numeric_limits::max()); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= max_dist); + LIBCPP_ASSERT(c.max_size() == max_dist); + } + { + typedef std::flat_multimap C; + ASSERT_SAME_TYPE(C::difference_type, std::ptrdiff_t); + ASSERT_SAME_TYPE(C::size_type, std::size_t); + const C::size_type max_dist = static_cast(std::numeric_limits::max()); + const C c; + ASSERT_NOEXCEPT(c.max_size()); + ASSERT_SAME_TYPE(decltype(c.max_size()), C::size_type); + assert(c.max_size() <= max_dist); + assert(c.max_size() <= alloc_max_size(std::allocator())); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp new file mode 100644 index 0000000000000..533f8da631fc8 --- /dev/null +++ 
b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.capacity/size.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type size() const noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + { + const M m = {{1, 'a'}, {1, 'b'}, {4, 'd'}, {5, 'e'}, {5, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 5); + } + { + const M m = {{1, 'a'}}; + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 1); + } + { + const M m; + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 0); + } + { + M m; + std::size_t s = 1000; + for (auto i = 0u; i < s; ++i) { + m.emplace(i, 'a'); + } + for (auto i = 0u; i < s; ++i) { + m.emplace(i, 'b'); + } + ASSERT_SAME_TYPE(decltype(m.size()), std::size_t); + ASSERT_NOEXCEPT(m.size()); + assert(m.size() == 2 * s); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp new file mode 100644 index 0000000000000..3e155eb2a1075 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/alloc.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// explicit flat_multimap(const Allocator& a); + +#include +#include +#include +#include + +#include "test_macros.h" +#include "test_allocator.h" +#include "../../../test_compare.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. 
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // explicit + using M = + std::flat_multimap, + std::vector>, + std::vector>>; + + static_assert(std::is_constructible_v>); + static_assert(!std::is_convertible_v, M>); + } + { + using A = test_allocator; + using M = + std::flat_multimap, + std::vector>, + std::vector>>; + M m(A(0, 5)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.keys().get_allocator().get_id() == 5); + assert(m.values().get_allocator().get_id() == 5); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp new file mode 100644 index 0000000000000..32f75daae7e38 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/assign_initializer_list.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(initializer_list il); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + { + M m = {{8, 8}, {10, 10}}; + assert(m.size() == 2); + m = {{3, 0}, {1, 0}, {2, 0}, {2, 1}, {3, 1}, {4, 0}, {3, 2}, {5, 0}, {6, 0}, {5, 1}}; + std::pair expected[] = {{1, 0}, {2, 0}, {2, 1}, {3, 0}, {3, 1}, {3, 2}, {4, 0}, {5, 0}, {5, 1}, {6, 0}}; + assert(std::ranges::equal(m, expected)); + } + { + M m = {{10, 1}, {8, 1}}; + assert(m.size() == 2); + m = {{3, 2}}; + std::pair expected[] = {{3, 2}}; + assert(std::ranges::equal(m, expected)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp new file mode 100644 index 0000000000000..1989b8a4ff68a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/compare.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// explicit flat_multimap(const key_compare& comp); +// template +// flat_multimap(const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using M1 = std::flat_multimap, std::vector>; + using M2 = std::flat_multimap, std::vector>; + using M3 = std::flat_multimap, std::vector>; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + using C = test_less; + auto m = std::flat_multimap(C(3)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(3)); + } + { + // The one-argument ctor is explicit. + using C = test_less; + static_assert(std::is_constructible_v, C>); + static_assert(!std::is_convertible_v>); + + static_assert(std::is_constructible_v, std::less>); + static_assert(!std::is_convertible_v, std::flat_multimap>); + } + { + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + auto m = std::flat_multimap, std::vector>(C(4), A1(5)); + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(4)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // explicit(false) + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + std::flat_multimap, std::deque> m = {C(4), A1(5)}; + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp() == C(4)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // If an allocator is given, it must be usable by both containers. + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::vector>; + static_assert(std::is_constructible_v>); + static_assert(!std::is_constructible_v, std::allocator>); + static_assert(!std::is_constructible_v, A>); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp new file mode 100644 index 0000000000000..17ee3c3864b1b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/containers.pass.cpp @@ -0,0 +1,187 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(key_container_type key_cont, mapped_container_type mapped_cont, +// const key_compare& comp = key_compare()); +// template +// flat_multimap(const key_container_type& key_cont, const mapped_container_type& mapped_cont, +// const Allocator& a); +// template +// flat_multimap(const key_container_type& key_cont, const mapped_container_type& mapped_cont, +// const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include + +#include "min_allocator.h" +#include "MoveOnly.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +struct P { + int first; + int second; + template + bool operator==(const std::pair& rhs) const { + return MoveOnly(first) == rhs.first && MoveOnly(second) == rhs.second; + } +}; + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + // flat_multimap(key_container_type , mapped_container_type) + using M = std::flat_multimap; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + std::vector vs = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto m = M(ks, vs); + std::pair expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + assert(std::ranges::equal(m, expected)); + + // explicit(false) + M m2 = {ks, vs}; + assert(m2 == m); + + m = M(std::move(ks), std::move(vs)); + assert(ks.empty()); // it was moved-from + assert(vs.empty()); // it was moved-from + assert(std::ranges::equal(m, expected)); + } + { + // flat_multimap(key_container_type , mapped_container_type) + // move-only + P expected[] = {{3, 3}, {3, 2}, {2, 1}, {1, 4}}; + using Ks = std::deque>; + using Vs = std::vector>; + using M = std::flat_multimap, Ks, Vs>; + Ks ks = {1, 3, 3, 2}; + Vs vs; + vs.push_back(4); + vs.push_back(3); + vs.push_back(2); + vs.push_back(1); + auto m = M(std::move(ks), std::move(vs)); + assert(ks.empty()); // it was moved-from + assert(vs.empty()); // it was moved-from + assert(std::ranges::equal(m, expected, std::equal_to<>())); + } + { + // flat_multimap(key_container_type , mapped_container_type) + // container's allocators are used + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto ks = std::vector({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto vs = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6)); + auto m = M(std::move(ks), std::move(vs)); + assert(ks.empty()); // it was moved-from + assert(vs.empty()); // it was moved-from + std::pair expected[] = {{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A(5)); + 
assert(m.values().get_allocator() == A(6)); + } + { + // flat_multimap(key_container_type , mapped_container_type, key_compare) + using C = test_less; + using M = std::flat_multimap; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + std::vector vs = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto m = M(ks, vs, C(4)); + std::pair expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(4)); + + // explicit(false) + M m2 = {ks, vs, C(4)}; + assert(m2 == m); + assert(m2.key_comp() == C(4)); + } + { + // flat_multimap(key_container_type , mapped_container_type, const Allocator&) + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto ks = std::vector({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto vs = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6)); + auto m = M(ks, vs, A(4)); // replaces the allocators + assert(!ks.empty()); // it was an lvalue above + assert(!vs.empty()); // it was an lvalue above + std::pair expected[] = {{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A(4)); + assert(m.values().get_allocator() == A(4)); + } + { + // flat_multimap(key_container_type , mapped_container_type, const Allocator&) + // explicit(false) + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto ks = std::vector({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(5)); + auto vs = std::deque({1, 1, 1, 2, 2, 3, 2, 3, 3}, A(6)); + M m = {ks, vs, A(4)}; // implicit ctor + assert(!ks.empty()); // it was an lvalue above + assert(!vs.empty()); // it was an lvalue above + std::pair expected[] = {{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A(4)); + assert(m.values().get_allocator() == A(4)); + } + { + // flat_multimap(key_container_type , mapped_container_type, key_compare, const Allocator&) + using C = test_less; + using A = test_allocator; + using M = std::flat_multimap, std::vector>; + std::vector ks = {1, 1, 1, 2, 2, 3, 2, 3, 3}; + std::vector vs = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto m = M(ks, vs, C(4), A(5)); + std::pair expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(4)); + assert(m.keys().get_allocator() == A(5)); + assert(m.values().get_allocator() == A(5)); + + // explicit(false) + M m2 = {ks, vs, C(4), A(5)}; + assert(m2 == m); + assert(m2.key_comp() == C(4)); + assert(m2.keys().get_allocator() == A(5)); + assert(m2.values().get_allocator() == A(5)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp new file mode 100644 index 0000000000000..0e6d12cd3c569 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy.pass.cpp @@ -0,0 +1,70 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(const flat_multimap& m); + +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + using C = test_less; + std::vector> ks({1, 1, 3, 3, 5}, test_allocator(6)); + std::vector> vs({2, 2, 1, 1, 1}, test_allocator(7)); + using M = std::flat_multimap; + auto mo = M(ks, vs, C(5)); + auto m = mo; + + assert(m.key_comp() == C(5)); + assert(m.keys() == ks); + assert(m.values() == vs); + assert(m.keys().get_allocator() == test_allocator(6)); + assert(m.values().get_allocator() == test_allocator(7)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(mo.keys() == ks); + assert(mo.values() == vs); + assert(mo.keys().get_allocator() == test_allocator(6)); + assert(mo.values().get_allocator() == test_allocator(7)); + } + { + using C = test_less; + using Ks = std::vector>; + using Vs = std::vector>; + auto ks = Ks({1, 3, 5, 5, 5, 5}, other_allocator(6)); + auto vs = Vs({2, 2, 5, 5, 5, 1}, other_allocator(7)); + using M = std::flat_multimap; + auto mo = M(Ks(ks, other_allocator(6)), Vs(vs, other_allocator(7)), C(5)); + auto m = mo; + + assert(m.key_comp() == C(5)); + assert(m.keys() == ks); + assert(m.values() == vs); + assert(m.keys().get_allocator() == other_allocator(-2)); + assert(m.values().get_allocator() == other_allocator(-2)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(mo.keys() == ks); + assert(mo.values() == vs); + assert(mo.keys().get_allocator() == other_allocator(6)); + assert(mo.values().get_allocator() == other_allocator(7)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp new file mode 100644 index 0000000000000..3047c004d42e9 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_alloc.pass.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(const flat_multimap&, const allocator_type&); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. 
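+    // In practice the allocator argument must be usable by *both* containers:
+    // M2 and M3 below each pair one container with an incompatible allocator,
+    // and the static_asserts verify those combinations are rejected.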
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + using C = test_less; + std::vector> ks({1, 3, 3, 5, 5}, test_allocator(6)); + std::vector> vs({2, 2, 1, 1, 1}, test_allocator(7)); + using M = std::flat_multimap; + auto mo = M(ks, vs, C(5)); + auto m = M(mo, test_allocator(3)); + + assert(m.key_comp() == C(5)); + assert(m.keys() == ks); + assert(m.values() == vs); + assert(m.keys().get_allocator() == test_allocator(3)); + assert(m.values().get_allocator() == test_allocator(3)); + + // mo is unchanged + assert(mo.key_comp() == C(5)); + assert(mo.keys() == ks); + assert(mo.values() == vs); + assert(mo.keys().get_allocator() == test_allocator(6)); + assert(mo.values().get_allocator() == test_allocator(7)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp new file mode 100644 index 0000000000000..233a9c6859318 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.addressof.compile.pass.cpp @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(const flat_multimap& s); + +// Validate whether the container can be copy-assigned (move-assigned, swapped) +// with an ADL-hijacking operator& + +#include +#include + +#include "test_macros.h" +#include "operator_hijacker.h" + +void test() { + std::flat_multimap so; + std::flat_multimap s; + s = so; + s = std::move(so); + swap(s, so); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp new file mode 100644 index 0000000000000..3dd7ebdd38871 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/copy_assign.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap& operator=(const flat_multimap& m);
+
+#include <cassert>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "test_macros.h"
+#include "../../../test_compare.h"
+#include "test_allocator.h"
+
+int main(int, char**) {
+  {
+    // test_allocator is not propagated
+    using C = test_less<int>;
+    std::vector<int, test_allocator<int>> ks({1, 1, 3, 3, 5}, test_allocator<int>(6));
+    std::vector<char, test_allocator<char>> vs({1, 2, 3, 4, 5}, test_allocator<char>(7));
+    using M = std::flat_multimap<int, char, C, decltype(ks), decltype(vs)>;
+    auto mo = M(ks, vs, C(5));
+    auto m  = M({{3, 3}, {4, 4}, {5, 5}}, C(3), test_allocator<int>(2));
+    m       = mo;
+
+    assert(m.key_comp() == C(5));
+    assert(m.keys() == ks);
+    assert(m.values() == vs);
+    assert(m.keys().get_allocator() == test_allocator<int>(2));
+    assert(m.values().get_allocator() == test_allocator<char>(2));
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert(mo.keys() == ks);
+    assert(mo.values() == vs);
+    assert(mo.keys().get_allocator() == test_allocator<int>(6));
+    assert(mo.values().get_allocator() == test_allocator<char>(7));
+  }
+  {
+    // other_allocator is propagated
+    using C  = test_less<int>;
+    using Ks = std::vector<int, other_allocator<int>>;
+    using Vs = std::vector<char, other_allocator<char>>;
+    auto ks  = Ks({1, 1, 3, 3, 5}, other_allocator<int>(6));
+    auto vs  = Vs({2, 1, 3, 2, 1}, other_allocator<char>(7));
+    using M  = std::flat_multimap<int, char, C, Ks, Vs>;
+    auto mo  = M(Ks(ks, other_allocator<int>(6)), Vs(vs, other_allocator<char>(7)), C(5));
+    auto m   = M({{3, 3}, {4, 4}, {5, 5}}, C(3), other_allocator<int>(2));
+    m        = mo;
+
+    assert(m.key_comp() == C(5));
+    assert(m.keys() == ks);
+    assert(m.values() == vs);
+    assert(m.keys().get_allocator() == other_allocator<int>(6));
+    assert(m.values().get_allocator() == other_allocator<char>(7));
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert(mo.keys() == ks);
+    assert(mo.values() == vs);
+    assert(mo.keys().get_allocator() == other_allocator<int>(6));
+    assert(mo.values().get_allocator() == other_allocator<char>(7));
+  }
+  {
+    // self-assignment
+    using M = std::flat_multimap<int, int>;
+    M m     = {{1, 1}, {3, 4}};
+    m       = static_cast<const M&>(m);
+    assert((m == M{{1, 1}, {3, 4}}));
+  }
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp
new file mode 100644
index 0000000000000..a9d8382bd037c
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.compile.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// Test CTAD on cases where deduction should fail.
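+// Deduction needs arguments from which both Key and T can be inferred (e.g. a
+// range of pairs, or a key container plus a mapped container); every case
+// below omits that information, so the deduction guides must SFINAE away.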
+ +#include +#include +#include +#include +#include + +struct NotAnAllocator { + friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } +}; + +using P = std::pair; +using PC = std::pair; + +template +concept CanDeductFlatMultimap = requires { std::flat_multimap{std::declval()...}; }; + +static_assert(CanDeductFlatMultimap, std::vector>); + +// cannot deduce Key and T from nothing +static_assert(!CanDeductFlatMultimap<>); + +// cannot deduce Key and T from just (KeyContainer), even if it's a container of pairs +static_assert(!CanDeductFlatMultimap>>); + +// cannot deduce Key and T from just (KeyContainer, Allocator) +static_assert(!CanDeductFlatMultimap, std::allocator>>); + +// cannot deduce Key and T from just (Compare) +static_assert(!CanDeductFlatMultimap>); + +// cannot deduce Key and T from just (Compare, Allocator) +static_assert(!CanDeductFlatMultimap, std::allocator>); + +// cannot deduce Key and T from just (Allocator) +static_assert(!CanDeductFlatMultimap>); + +// cannot convert from some arbitrary unrelated type +static_assert(!CanDeductFlatMultimap); diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp new file mode 100644 index 0000000000000..a718d9cfad5b7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.pass.cpp @@ -0,0 +1,343 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "deduction_guides_sfinae_checks.h" +#include "test_allocator.h" + +using P = std::pair; +using PC = std::pair; + +void test_copy() { + { + std::flat_multimap source = {{1, 2}, {1, 3}}; + std::flat_multimap s(source); + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } + { + std::flat_multimap> source = {{1, 2}, {1, 3}}; + std::flat_multimap s{source}; // braces instead of parens + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } + { + std::flat_multimap> source = {{1, 2}, {1, 3}}; + std::flat_multimap s(source, std::allocator()); + ASSERT_SAME_TYPE(decltype(s), decltype(source)); + assert(s == source); + } +} + +void test_containers() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({1, 1, 2, 2, 2, 3, INT_MAX}, test_allocator(0, 42)); + std::deque> sorted_vs({1, 3, 2, 4, 5, 4, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{1, 1}, {1, 3}, {2, 2}, {2, 4}, {2, 5}, {3, 4}, {INT_MAX, 3}}; + { + std::flat_multimap s(ks, vs); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(std::sorted_equivalent, sorted_ks, sorted_vs); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), 
decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(ks, vs, test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } + { + std::flat_multimap s(std::sorted_equivalent, sorted_ks, sorted_vs, test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } +} + +void test_containers_compare() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({INT_MAX, 3, 2, 2, 2, 1, 1}, test_allocator(0, 42)); + std::deque> sorted_vs({3, 4, 2, 4, 5, 1, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{INT_MAX, 3}, {3, 4}, {2, 2}, {2, 4}, {2, 5}, {1, 1}, {1, 3}}; + { + std::flat_multimap s(ks, vs, std::greater()); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(std::sorted_equivalent, sorted_ks, sorted_vs, std::greater()); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 43); + } + { + std::flat_multimap s(ks, vs, std::greater(), test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } + { + std::flat_multimap s( + std::sorted_equivalent, sorted_ks, sorted_vs, std::greater(), test_allocator(0, 44)); + + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap, decltype(ks), decltype(vs)>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 44); + assert(s.values().get_allocator().get_id() == 44); + } +} + +void test_iter_iter() { + const P arr[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const P sorted_arr[] = {{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}; + const PC arrc[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const PC sorted_arrc[] = {{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}; + { + std::flat_multimap m(std::begin(arr), std::end(arr)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::begin(arrc), std::end(arrc)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arr), std::end(sorted_arr)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arrc), std::end(sorted_arrc)); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { 
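+    // An empty iterator range taken from an existing flat_multimap still
+    // deduces the source map's exact specialization, as verified below.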
+ std::flat_multimap mo; + std::flat_multimap m(mo.begin(), mo.end()); + ASSERT_SAME_TYPE(decltype(m), decltype(mo)); + } + { + std::flat_multimap mo; + std::flat_multimap m(mo.cbegin(), mo.cend()); + ASSERT_SAME_TYPE(decltype(m), decltype(mo)); + } + { + std::pair source[3] = {{1, 1}, {1, 1}, {3, 3}}; + std::flat_multimap s = {source, source + 3}; // flat_multimap(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {1, 1}, {3, 3}}; + std::flat_multimap s{source, source + 3}; // flat_multimap(InputIterator, InputIterator) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 3); + } + { + std::pair source[3] = {{1, 1}, {1, 2}, {3, 3}}; + std::flat_multimap s{ + std::sorted_equivalent, source, source + 3}; // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator) + static_assert(std::is_same_v>); + assert(s.size() == 3); + } +} + +void test_iter_iter_compare() { + const P arr[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const P sorted_arr[] = {{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}; + const PC arrc[] = {{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + const PC sorted_arrc[] = {{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}; + using C = std::greater; + { + std::flat_multimap m(std::begin(arr), std::end(arr), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::begin(arrc), std::end(arrc), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arr), std::end(sorted_arr), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, std::begin(sorted_arrc), std::end(sorted_arrc), C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap mo; + std::flat_multimap m(mo.begin(), mo.end(), C()); + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + } + { + std::flat_multimap mo; + std::flat_multimap m(mo.cbegin(), mo.cend(), C()); + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + } +} + +void test_initializer_list() { + const P sorted_arr[] = {{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}; + { + std::flat_multimap m{std::pair{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}; + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, {std::pair{1, 1L}, {1, 1L}, {2, 2L}, {3, 1L}, {INT_MAX, 1L}}); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap s = {std::make_pair(1, 'a')}; // flat_multimap(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 1); + } + { + using M = std::flat_multimap; + M m; + std::flat_multimap s = {std::make_pair(m, m)}; // flat_multimap(initializer_list>) + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap); + assert(s.size() == 1); + assert(s.find(m)->second == m); + } +} + +void test_initializer_list_compare() { + const P sorted_arr[] = {{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}; + using C = std::greater; + { + std::flat_multimap m({std::pair{1, 1L}, {2, 2L}, {1, 1L}, {INT_MAX, 1L}, {3, 1L}}, C()); + + 
ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } + { + std::flat_multimap m(std::sorted_equivalent, {std::pair{INT_MAX, 1L}, {3, 1L}, {2, 2L}, {1, 1L}, {1, 1L}}, C()); + + ASSERT_SAME_TYPE(decltype(m), std::flat_multimap); + assert(std::ranges::equal(m, sorted_arr)); + } +} + +void test_from_range() { + std::list> r = {{1, 1}, {2, 2}, {1, 1}, {INT_MAX, 4}, {3, 5}}; + const std::pair expected[] = {{1, 1}, {1, 1}, {2, 2}, {3, 5}, {INT_MAX, 4}}; + { + std::flat_multimap s(std::from_range, r); + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap>); + assert(std::ranges::equal(s, expected)); + } + { + std::flat_multimap s(std::from_range, r, test_allocator(0, 42)); + ASSERT_SAME_TYPE( + decltype(s), + std::flat_multimap, + std::vector>, + std::vector>>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 42); + } +} + +void test_from_range_compare() { + std::list> r = {{1, 1}, {2, 2}, {1, 1}, {INT_MAX, 4}, {3, 5}}; + const std::pair expected[] = {{INT_MAX, 4}, {3, 5}, {2, 2}, {1, 1}, {1, 1}}; + { + std::flat_multimap s(std::from_range, r, std::greater()); + ASSERT_SAME_TYPE(decltype(s), std::flat_multimap>); + assert(std::ranges::equal(s, expected)); + } + { + std::flat_multimap s(std::from_range, r, std::greater(), test_allocator(0, 42)); + ASSERT_SAME_TYPE( + decltype(s), + std::flat_multimap, + std::vector>, + std::vector>>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().get_id() == 42); + assert(s.values().get_allocator().get_id() == 42); + } +} + +int main(int, char**) { + // Each test function also tests the sorted_equivalent-prefixed and allocator-suffixed overloads. + test_copy(); + test_containers(); + test_containers_compare(); + test_iter_iter(); + test_iter_iter_compare(); + test_initializer_list(); + test_initializer_list_compare(); + test_from_range(); + test_from_range_compare(); + + AssociativeContainerDeductionGuidesSfinaeAway>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp new file mode 100644 index 0000000000000..c25218e890f21 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct.verify.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// Test CTAD on cases where deduction should fail. 
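+// Unlike deduct.compile.pass.cpp, this is a clang-verify test: each ill-formed
+// construction below is paired with the diagnostic the compiler must emit.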
+ +#include +#include +#include + +struct NotAnAllocator { + friend bool operator<(NotAnAllocator, NotAnAllocator) { return false; } +}; + +using P = std::pair; +using PC = std::pair; + +void test() { + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m{{1, 1L}, {2, 2L}, {3, 3L}}; + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m({{1, 1L}, {2, 2L}, {3, 3L}}, std::less()); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m({{1, 1L}, {2, 2L}, {3, 3L}}, std::less(), std::allocator()); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // cannot deduce that the inner braced things should be std::pair and not something else + std::flat_multimap m({{1, 1L}, {2, 2L}, {3, 3L}}, std::allocator()); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // since we have parens, not braces, this deliberately does not find the initializer_list constructor + std::flat_multimap m(P{1, 1L}); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } + { + // since we have parens, not braces, this deliberately does not find the initializer_list constructor + std::flat_multimap m(PC{1, 1L}); + // expected-error-re@-1{{{{no viable constructor or deduction guide for deduction of template arguments of '.*flat_multimap'}}}} + } +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp new file mode 100644 index 0000000000000..1955a8806631b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/deduct_pmr.pass.cpp @@ -0,0 +1,107 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: availability-pmr-missing + +// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_allocator.h" + +using P = std::pair; +using PC = std::pair; + +void test_containers() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({1, 1, 2, 2, 2, 3, INT_MAX}, test_allocator(0, 42)); + std::deque> sorted_vs({1, 3, 2, 4, 5, 4, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{1, 1}, {1, 3}, {2, 2}, {2, 4}, {2, 5}, {3, 4}, {INT_MAX, 3}}; + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(ks.begin(), ks.end(), &mr); + std::pmr::deque pvs(vs.begin(), vs.end(), &mr); + std::flat_multimap s(std::move(pks), std::move(pvs), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(sorted_ks.begin(), sorted_ks.end(), &mr); + std::pmr::deque pvs(sorted_vs.begin(), sorted_vs.end(), &mr); + std::flat_multimap s(std::sorted_equivalent, std::move(pks), std::move(pvs), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } +} + +void test_containers_compare() { + std::deque> ks({1, 2, 1, 2, 2, INT_MAX, 3}, test_allocator(0, 42)); + std::deque> vs({1, 2, 3, 4, 5, 3, 4}, test_allocator(0, 43)); + std::deque> sorted_ks({INT_MAX, 3, 2, 2, 2, 1, 1}, test_allocator(0, 42)); + std::deque> sorted_vs({3, 4, 2, 4, 5, 1, 3}, test_allocator(0, 43)); + const std::pair expected[] = {{INT_MAX, 3}, {3, 4}, {2, 2}, {2, 4}, {2, 5}, {1, 1}, {1, 3}}; + + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(ks.begin(), ks.end(), &mr); + std::pmr::deque pvs(vs.begin(), vs.end(), &mr); + std::flat_multimap s(std::move(pks), std::move(pvs), std::greater(), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } + { + std::pmr::monotonic_buffer_resource mr; + std::pmr::monotonic_buffer_resource mr2; + std::pmr::deque pks(sorted_ks.begin(), sorted_ks.end(), &mr); + std::pmr::deque pvs(sorted_vs.begin(), sorted_vs.end(), &mr); + std::flat_multimap s(std::sorted_equivalent, std::move(pks), std::move(pvs), std::greater(), &mr2); + + ASSERT_SAME_TYPE( + decltype(s), std::flat_multimap, std::pmr::deque, std::pmr::deque>); + assert(std::ranges::equal(s, expected)); + assert(s.keys().get_allocator().resource() == &mr2); + assert(s.values().get_allocator().resource() == &mr2); + } +} + +int main(int, char**) { + test_containers(); + test_containers_compare(); + + return 0; +} diff --git 
a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp new file mode 100644 index 0000000000000..c910f748d95fe --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(); + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" + +struct DefaultCtableComp { + explicit DefaultCtableComp() { default_constructed_ = true; } + bool operator()(int, int) const { return false; } + bool default_constructed_ = false; +}; + +int main(int, char**) { + { + std::flat_multimap m; + assert(m.empty()); + } + { + // explicit(false) + std::flat_multimap m = {}; + assert(m.empty()); + } + { + std::flat_multimap>> m; + assert(m.empty()); + assert(m.begin() == m.end()); + assert(m.key_comp().default_constructed_); + } + { + using A1 = explicit_allocator; + using A2 = explicit_allocator; + { + std::flat_multimap, std::vector> m; + assert(m.empty()); + assert(m.key_comp().default_constructed_); + } + { + A1 a1; + std::flat_multimap, std::vector> m(a1); + assert(m.empty()); + assert(m.key_comp().default_constructed_); + } + } + { + // If an allocator is given, it must be usable by both containers. + using A = test_allocator; + using M = std::flat_multimap, std::vector, std::vector>; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v>); + static_assert(!std::is_constructible_v); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp new file mode 100644 index 0000000000000..fa490f120875f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/default_noexcept.pass.cpp @@ -0,0 +1,61 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap() +// noexcept( +// is_nothrow_default_constructible_v && +// is_nothrow_default_constructible_v && +// is_nothrow_default_constructible_v); + +// This tests a conforming extension + +#include +#include +#include +#include + +#include "test_macros.h" +#include "MoveOnly.h" +#include "test_allocator.h" + +struct ThrowingCtorComp { + ThrowingCtorComp() noexcept(false) {} + bool operator()(const auto&, const auto&) const { return false; } +}; + +int main(int, char**) { +#if defined(_LIBCPP_VERSION) + { + using C = std::flat_multimap; + static_assert(std::is_nothrow_default_constructible_v); + C c; + } + { + using C = + std::flat_multimap, std::vector>>; + static_assert(std::is_nothrow_default_constructible_v); + C c; + } +#endif // _LIBCPP_VERSION + { + using C = + std::flat_multimap, std::vector>>; + static_assert(!std::is_nothrow_default_constructible_v); + C c; + } + { + using C = std::flat_multimap; + static_assert(!std::is_nothrow_default_constructible_v); + C c; + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp new file mode 100644 index 0000000000000..fd31e440a6614 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/dtor_noexcept.pass.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// ~flat_multimap(); + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "MoveOnly.h" +#include "test_allocator.h" + +struct ThrowingDtorComp { + bool operator()(const auto&, const auto&) const; + ~ThrowingDtorComp() noexcept(false) {} +}; + +int main(int, char**) { + { + using C = std::flat_multimap; + static_assert(std::is_nothrow_destructible_v); + C c; + } + { + using V = std::vector>; + using C = std::flat_multimap, V, V>; + static_assert(std::is_nothrow_destructible_v); + C c; + } + { + using V = std::deque>; + using C = std::flat_multimap, V, V>; + static_assert(std::is_nothrow_destructible_v); + C c; + } +#if defined(_LIBCPP_VERSION) + { + using C = std::flat_multimap; + static_assert(!std::is_nothrow_destructible_v); + C c; + } +#endif // _LIBCPP_VERSION + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp new file mode 100644 index 0000000000000..8e89192ec0ea1 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/initializer_list.pass.cpp @@ -0,0 +1,159 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(initializer_list il, const key_compare& comp = key_compare()); +// template +// flat_multimap(initializer_list il, const Alloc& a); +// template +// flat_multimap(initializer_list il, const key_compare& comp, const Alloc& a); + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" + +#include "../../../test_compare.h" + +struct DefaultCtableComp { + explicit DefaultCtableComp() { default_constructed_ = true; } + bool operator()(int, int) const { return false; } + bool default_constructed_ = false; +}; + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + using IL = std::initializer_list>; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + + { + // initializer_list needs to match exactly + using M = std::flat_multimap; + using C = typename M::key_compare; + static_assert(std::is_constructible_v>>); + static_assert(std::is_constructible_v>, C>); + static_assert(std::is_constructible_v>, C, std::allocator>); + static_assert(std::is_constructible_v>, std::allocator>); + static_assert(!std::is_constructible_v>>); + static_assert(!std::is_constructible_v>, C>); + static_assert( + !std::is_constructible_v>, C, std::allocator>); + static_assert(!std::is_constructible_v>, std::allocator>); + static_assert(!std::is_constructible_v>>); + static_assert(!std::is_constructible_v>, C>); + static_assert( + !std::is_constructible_v>, C, std::allocator>); + static_assert( + !std::is_constructible_v>, std::allocator>); + } + + std::pair expected[] = {{1, 1}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {5, 2}}; + { + // flat_multimap(initializer_list); + using M = std::flat_multimap; + std::initializer_list> il = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}; + M m(il); + assert(std::ranges::equal(m, expected)); + } + { + // flat_multimap(initializer_list); + // explicit(false) + using M = std::flat_multimap; + M m = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}; + assert(std::ranges::equal(m, expected)); + } + { + // flat_multimap(initializer_list); + using M = std::flat_multimap, std::deque>>; + M m = {{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}; + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + } + { + using A = explicit_allocator; + { + // flat_multimap(initializer_list); + // different comparator + using M = std::flat_multimap, std::deque>; + M m = {{1, 1}, {2, 2}, {3, 3}}; + assert(m.size() == 3); + + std::pair expected1[] = {{1, 1}, {2, 2}, {3, 3}}; + assert(std::ranges::equal(m, expected1)); + assert(m.key_comp().default_constructed_); + } + { + 
// flat_multimap(initializer_list, const Allocator&); + using M = std::flat_multimap, std::deque, std::vector>; + A a; + M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, a); + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + } + } + { + // flat_multimap(initializer_list, const key_compare&); + using C = test_less; + using M = std::flat_multimap; + auto m = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10)); + assert(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(10)); + + // explicit(false) + M m2 = {{{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, C(10)}; + assert(m2 == m); + assert(m2.key_comp() == C(10)); + } + { + // flat_multimap(initializer_list, const key_compare&); + // Sorting uses the comparator that was passed in + using M = std::flat_multimap, std::deque>>; + auto m = M({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, std::greater()); + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + assert(m.key_comp()(2, 1) == true); + } + { + // flat_multimap(initializer_list il, const key_compare& comp, const Alloc& a); + using A = explicit_allocator; + using M = std::flat_multimap, std::deque, std::vector>; + A a; + M m({{5, 2}, {2, 2}, {2, 2}, {3, 3}, {1, 1}, {3, 3}}, {}, a); + assert(std::equal(m.rbegin(), m.rend(), expected, expected + 6)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp new file mode 100644 index 0000000000000..c9c5e6c99d1c8 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/iter_iter.pass.cpp @@ -0,0 +1,154 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// template +// flat_multimap(InputIterator first, InputIterator last, const key_compare& comp = key_compare()); +// template +// flat_multimap(InputIterator first, InputIterator last, const Allocator& a); +// template +// flat_multimap(InputIterator first, InputIterator last, const key_compare& comp, const Allocator& a); + +#include +#include +#include +#include +#include + +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" +#include "../../../test_compare.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. 
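+    // The same allocator-awareness requirement applies to the iterator-range
+    // constructors; the static_asserts below check it via is_constructible_v.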
+ + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + using Iter1 = typename M1::iterator; + using Iter2 = typename M2::iterator; + using Iter3 = typename M3::iterator; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + + using P = std::pair; + P ar[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}}; + P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}}; + { + // flat_multimap(InputIterator , InputIterator) + // cpp17_input_iterator + using M = std::flat_multimap; + auto m = M(cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)); + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + + // explicit(false) + M m2 = {cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)}; + assert(m2 == m); + } + { + // flat_multimap(InputIterator , InputIterator) + // greater + using M = std::flat_multimap, std::deque>, std::deque>; + auto m = M(cpp17_input_iterator(ar), cpp17_input_iterator(ar + 9)); + assert((m.keys() == std::deque>{3, 3, 3, 2, 2, 2, 1, 1, 1})); + LIBCPP_ASSERT((m.values() == std::deque{6, 8, 9, 4, 5, 7, 1, 2, 3})); + } + { + // flat_multimap(InputIterator , InputIterator) + // Test when the operands are of array type (also contiguous iterator type) + using M = std::flat_multimap, std::vector>>; + auto m = M(ar, ar); + assert(m.empty()); + } + { + // flat_multimap(InputIterator , InputIterator, const key_compare&) + using C = test_less; + using M = std::flat_multimap, std::deque>; + auto m = M(ar, ar + 9, C(3)); + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + + // explicit(false) + M m2 = {ar, ar + 9, C(3)}; + assert(m2 == m); + assert(m2.key_comp() == C(3)); + } + { + // flat_multimap(InputIterator , InputIterator, const Allocator&) + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + auto m = M(ar, ar + 9, A1(5)); + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // flat_multimap(InputIterator , InputIterator, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::vector, std::deque>; + M m = {ar, ar + 9, A1(5)}; // implicit ctor + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // flat_multimap(InputIterator , InputIterator, const key_compare&, const Allocator&) + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::deque>; + auto m = M(ar, ar + 9, C(3), A1(5)); + assert(std::ranges::equal(m.keys(), expected | 
std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.key_comp() == C(3)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + { + // flat_multimap(InputIterator , InputIterator, const key_compare&, const Allocator&) + // explicit(false) + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::deque, std::vector>; + M m = {ar, ar + 9, {}, A2(5)}; // implicit ctor + assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>)); + LIBCPP_ASSERT(std::ranges::equal(m, expected)); + assert(m.keys().get_allocator() == A1(5)); + assert(m.values().get_allocator() == A2(5)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp new file mode 100644 index 0000000000000..893c9247959d6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move.pass.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(flat_multimap&&); + +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" +#include "min_allocator.h" + +int main(int, char**) { + { + using C = test_less; + using A = test_allocator; + using M = std::flat_multimap, std::deque>; + M mo = M({{1, 1}, {1, 2}, {3, 1}}, C(5), A(7)); + M m = std::move(mo); + assert((m == M{{1, 1}, {1, 2}, {3, 1}})); + assert(m.key_comp() == C(5)); + assert(m.keys().get_allocator() == A(7)); + assert(m.values().get_allocator() == A(7)); + + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(mo.keys().get_allocator().get_id() == test_alloc_base::moved_value); + assert(mo.values().get_allocator().get_id() == test_alloc_base::moved_value); + } + { + using C = test_less; + using A = min_allocator; + using M = std::flat_multimap, std::deque>; + M mo = M({{1, 1}, {1, 2}, {3, 1}}, C(5), A()); + M m = std::move(mo); + assert((m == M{{1, 1}, {1, 2}, {3, 1}})); + assert(m.key_comp() == C(5)); + assert(m.keys().get_allocator() == A()); + assert(m.values().get_allocator() == A()); + + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(m.keys().get_allocator() == A()); + assert(m.values().get_allocator() == A()); + } + { + // A moved-from flat_multimap maintains its class invariant in the presence of moved-from comparators. 
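+    // The standard only guarantees a valid-but-unspecified state after move;
+    // keeping the moved-from map sorted (and, in libc++, empty) is what the
+    // assertions below rely on.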
+ using M = std::flat_multimap>; + M mo = M({{1, 1}, {1, 2}, {3, 1}}, std::less()); + M m = std::move(mo); + assert(m.size() == 3); + assert(std::is_sorted(m.begin(), m.end(), m.value_comp())); + assert(m.key_comp()(1, 2) == true); + + assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp())); + LIBCPP_ASSERT(m.key_comp()(1, 2) == true); + LIBCPP_ASSERT(mo.empty()); + mo.insert({{1, 1}, {1, 2}, {3, 1}}); // insert has no preconditions + assert(m == mo); + } + { + // moved-from object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m1 = M({1, 1, 3}, {1, 2, 3}); + M m2 = std::move(m1); + assert(m2.size() == 3); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m1.keys().size() == 0); + LIBCPP_ASSERT(m1.values().size() == 0); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp new file mode 100644 index 0000000000000..a0259e805ac5a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_alloc.pass.cpp @@ -0,0 +1,82 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap(flat_multimap&&, const allocator_type&); + +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" +#include "../../../test_compare.h" +#include "test_allocator.h" + +int main(int, char**) { + { + // The constructors in this subclause shall not participate in overload + // resolution unless uses_allocator_v is true + // and uses_allocator_v is true. + + using C = test_less; + using A1 = test_allocator; + using A2 = other_allocator; + using V1 = std::vector; + using V2 = std::vector; + using M1 = std::flat_multimap; + using M2 = std::flat_multimap; + using M3 = std::flat_multimap; + static_assert(std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + static_assert(!std::is_constructible_v); + } + { + std::pair expected[] = {{1, 1}, {1, 2}, {2, 3}, {2, 2}, {3, 1}}; + using C = test_less; + using A = test_allocator; + using M = std::flat_multimap, std::deque>; + auto mo = M(expected, expected + 5, C(5), A(7)); + auto m = M(std::move(mo), A(3)); + + assert(m.key_comp() == C(5)); + assert(m.size() == 5); + auto [keys, values] = std::move(m).extract(); + assert(keys.get_allocator() == A(3)); + assert(values.get_allocator() == A(3)); + assert(std::ranges::equal(keys, expected | std::views::elements<0>)); + assert(std::ranges::equal(values, expected | std::views::elements<1>)); + + // The original flat_multimap is moved-from. 
+ assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp())); + assert(mo.empty()); + assert(mo.key_comp() == C(5)); + assert(mo.keys().get_allocator() == A(7)); + assert(mo.values().get_allocator() == A(7)); + } + { + // moved-from object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m1 = M({1, 1, 3}, {1, 2, 3}); + M m2(std::move(m1), std::allocator{}); + assert(m2.size() == 3); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m1.keys().size() == 0); + LIBCPP_ASSERT(m1.values().size() == 0); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp new file mode 100644 index 0000000000000..38200d008c78a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(flat_multimap&&); + +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "MoveOnly.h" +#include "../../../test_compare.h" +#include "test_allocator.h" +#include "min_allocator.h" + +int main(int, char**) { + { + using C = test_less; + using A1 = test_allocator; + using A2 = test_allocator; + using M = std::flat_multimap, std::vector>; + M mo = M({{1, 1}, {1, 3}, {3, 2}}, C(5), A1(7)); + M m = M({}, C(3), A1(7)); + m = std::move(mo); + assert((m == M{{1, 1}, {1, 3}, {3, 2}})); + assert(m.key_comp() == C(5)); + auto [ks, vs] = std::move(m).extract(); + assert(ks.get_allocator() == A1(7)); + assert(vs.get_allocator() == A2(7)); + assert(mo.empty()); + } + { + using C = test_less; + using A1 = other_allocator; + using A2 = other_allocator; + using M = std::flat_multimap, std::deque>; + M mo = M({{4, 5}, {4, 4}}, C(5), A1(7)); + M m = M({{1, 1}, {1, 2}, {1, 3}, {4, 4}}, C(3), A1(7)); + m = std::move(mo); + assert((m == M{{4, 5}, {4, 4}})); + assert(m.key_comp() == C(5)); + auto [ks, vs] = std::move(m).extract(); + assert(ks.get_allocator() == A1(7)); + assert(vs.get_allocator() == A2(7)); + assert(mo.empty()); + } + { + using A = min_allocator; + using M = std::flat_multimap, std::vector, std::vector>; + M mo = M({{5, 1}, {5, 2}, {3, 3}}, A()); + M m = M({{4, 4}, {4, 3}, {4, 2}, {1, 1}}, A()); + m = std::move(mo); + assert((m == M{{5, 1}, {5, 2}, {3, 3}})); + auto [ks, vs] = std::move(m).extract(); + assert(ks.get_allocator() == A()); + assert(vs.get_allocator() == A()); + assert(mo.empty()); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp new file mode 100644 index 0000000000000..bc65dca32899c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_clears.pass.cpp @@ -0,0 +1,101 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(flat_multimap&&); +// Preserves the class invariant for the moved-from flat_multimap. + +#include +#include +#include +#include +#include +#include +#include + +#include "../helpers.h" +#include "test_macros.h" + +struct MoveNegates { + int value_ = 0; + MoveNegates() = default; + MoveNegates(int v) : value_(v) {} + MoveNegates(MoveNegates&& rhs) : value_(rhs.value_) { rhs.value_ = -rhs.value_; } + MoveNegates& operator=(MoveNegates&& rhs) { + value_ = rhs.value_; + rhs.value_ = -rhs.value_; + return *this; + } + ~MoveNegates() = default; + auto operator<=>(const MoveNegates&) const = default; +}; + +struct MoveClears { + int value_ = 0; + MoveClears() = default; + MoveClears(int v) : value_(v) {} + MoveClears(MoveClears&& rhs) : value_(rhs.value_) { rhs.value_ = 0; } + MoveClears& operator=(MoveClears&& rhs) { + value_ = rhs.value_; + rhs.value_ = 0; + return *this; + } + ~MoveClears() = default; + auto operator<=>(const MoveClears&) const = default; +}; + +int main(int, char**) { + { + const std::pair expected[] = {{1, 1}, {1, 2}, {3, 3}, {3, 4}, {5, 5}, {6, 6}, {7, 7}, {8, 8}}; + using M = std::flat_multimap, std::vector>; + M m = M(expected, expected + 8); + M m2 = M(expected, expected + 3); + + m2 = std::move(m); + + assert(std::equal(m2.begin(), m2.end(), expected, expected + 8)); + LIBCPP_ASSERT(m.empty()); + check_invariant(m); + m.insert({1, 1}); + m.insert({2, 2}); + assert(m.contains(1)); + assert(m.find(2) != m.end()); + } + { + const std::pair expected[] = {{1, 1}, {1, 2}, {3, 3}, {4, 4}, {5, 5}, {5, 6}, {7, 7}, {8, 8}}; + using M = std::flat_multimap, std::vector>; + M m = M(expected, expected + 8); + M m2 = M(expected, expected + 3); + + m2 = std::move(m); + + assert(std::equal(m2.begin(), m2.end(), expected, expected + 8)); + LIBCPP_ASSERT(m.empty()); + check_invariant(m); + m.insert({1, 1}); + m.insert({2, 2}); + assert(m.contains(1)); + assert(m.find(2) != m.end()); + } + { + // moved-from object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m1 = M({1, 1, 3}, {1, 2, 3}); + M m2 = M({1, 1}, {1, 2}); + m2 = std::move(m1); + assert(m2.size() == 3); + check_invariant(m1); + LIBCPP_ASSERT(m1.empty()); + LIBCPP_ASSERT(m1.keys().size() == 0); + LIBCPP_ASSERT(m1.values().size() == 0); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp new file mode 100644 index 0000000000000..4eb58313f6f72 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_assign_noexcept.pass.cpp @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap& operator=(flat_multimap&& c) +// noexcept( +// is_nothrow_move_assignable::value && +// is_nothrow_move_assignable::value && +// is_nothrow_copy_assignable::value); + +// This tests a conforming extension + +#include +#include +#include +#include +#include + +#include "MoveOnly.h" +#include "test_allocator.h" +#include "test_macros.h" + +struct MoveSensitiveComp { + MoveSensitiveComp() noexcept(false) = default; + MoveSensitiveComp(const MoveSensitiveComp&) noexcept(false) = default; + MoveSensitiveComp(MoveSensitiveComp&& rhs) { rhs.is_moved_from_ = true; } + MoveSensitiveComp& operator=(const MoveSensitiveComp&) noexcept = default; + MoveSensitiveComp& operator=(MoveSensitiveComp&& rhs) { + rhs.is_moved_from_ = true; + return *this; + } + bool operator()(const auto&, const auto&) const { return false; } + bool is_moved_from_ = false; +}; + +struct MoveThrowsComp { + MoveThrowsComp(MoveThrowsComp&&) noexcept(false); + MoveThrowsComp(const MoveThrowsComp&) noexcept(true); + MoveThrowsComp& operator=(MoveThrowsComp&&) noexcept(false); + MoveThrowsComp& operator=(const MoveThrowsComp&) noexcept(true); + bool operator()(const auto&, const auto&) const; +}; + +int main(int, char**) { + { + using C = std::flat_multimap; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + using C = + std::flat_multimap, + std::vector>, + std::vector>>; + LIBCPP_STATIC_ASSERT(std::is_nothrow_move_assignable_v); + } + { + // Test with a comparator that throws on move-assignment. + using C = std::flat_multimap; + LIBCPP_STATIC_ASSERT(!std::is_nothrow_move_assignable_v); + } + { + // Test with a container that throws on move-assignment. + using C = std::flat_multimap, std::pmr::vector, std::vector>; + static_assert(!std::is_nothrow_move_assignable_v); + } + { + // Test with a container that throws on move-assignment. + using C = std::flat_multimap, std::vector, std::pmr::vector>; + static_assert(!std::is_nothrow_move_assignable_v); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp new file mode 100644 index 0000000000000..c2085e32be532 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp
new file mode 100644
index 0000000000000..c2085e32be532
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_exceptions.pass.cpp
@@ -0,0 +1,71 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: no-exceptions
+
+// <flat_map>
+
+// flat_multimap(flat_multimap&& s);
+// If any member function in [flat.multimap.defn] exits via an exception, the invariant is restored.
+
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "../helpers.h"
+#include "test_macros.h"
+
+static int countdown = 0;
+
+struct EvilContainer : std::vector<int> {
+  EvilContainer() = default;
+  EvilContainer(EvilContainer&& rhs) {
+    // Throw on move-construction.
+    if (--countdown == 0) {
+      rhs.insert(rhs.end(), 0);
+      rhs.insert(rhs.end(), 0);
+      throw 42;
+    }
+  }
+};
+
+int main(int, char**) {
+  {
+    using M   = std::flat_multimap<int, int, std::less<int>, EvilContainer, std::vector<int>>;
+    M mo      = {{1, 1}, {1, 2}, {3, 3}};
+    countdown = 1;
+    try {
+      M m = std::move(mo);
+      assert(false); // not reached
+    } catch (int x) {
+      assert(x == 42);
+    }
+    // The source flat_multimap maintains its class invariant.
+    check_invariant(mo);
+    LIBCPP_ASSERT(mo.empty());
+  }
+  {
+    using M   = std::flat_multimap<int, int, std::less<int>, std::vector<int>, EvilContainer>;
+    M mo      = {{1, 1}, {1, 2}, {3, 3}};
+    countdown = 1;
+    try {
+      M m = std::move(mo);
+      assert(false); // not reached
+    } catch (int x) {
+      assert(x == 42);
+    }
+    // The source flat_multimap maintains its class invariant.
+    check_invariant(mo);
+    LIBCPP_ASSERT(mo.empty());
+  }
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp
new file mode 100644
index 0000000000000..e038902e26d52
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/move_noexcept.pass.cpp
@@ -0,0 +1,104 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap(flat_multimap&&)
+//        noexcept(is_nothrow_move_constructible<key_container_type>::value &&
+//                 is_nothrow_move_constructible<mapped_container_type>::value &&
+//                 is_nothrow_copy_constructible<key_compare>::value);
+
+// This tests a conforming extension
+
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <memory_resource>
+#include <type_traits>
+#include <vector>
+
+#include "test_macros.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+
+template <class T>
+struct ThrowingMoveAllocator {
+  using value_type                                    = T;
+  explicit ThrowingMoveAllocator()                    = default;
+  ThrowingMoveAllocator(const ThrowingMoveAllocator&) = default;
+  ThrowingMoveAllocator(ThrowingMoveAllocator&&) noexcept(false) {}
+  T* allocate(std::ptrdiff_t n) { return std::allocator<T>().allocate(n); }
+  void deallocate(T* p, std::ptrdiff_t n) { return std::allocator<T>().deallocate(p, n); }
+  friend bool operator==(ThrowingMoveAllocator, ThrowingMoveAllocator) = default;
+};
+
+struct ThrowingMoveComp {
+  ThrowingMoveComp() = default;
+  ThrowingMoveComp(const ThrowingMoveComp&) noexcept(true) {}
+  ThrowingMoveComp(ThrowingMoveComp&&) noexcept(false) {}
+  bool operator()(const auto&, const auto&) const { return false; }
+};
+
+struct MoveSensitiveComp {
+  MoveSensitiveComp() noexcept(false)                  = default;
+  MoveSensitiveComp(const MoveSensitiveComp&) noexcept = default;
+  MoveSensitiveComp(MoveSensitiveComp&& rhs) { rhs.is_moved_from_ = true; }
+  MoveSensitiveComp& operator=(const MoveSensitiveComp&) noexcept(false) = default;
+  MoveSensitiveComp& operator=(MoveSensitiveComp&& rhs) {
+    rhs.is_moved_from_ = true;
+    return *this;
+  }
+  bool operator()(const auto&, const auto&) const { return false; }
+  bool is_moved_from_ = false;
+};
+
+int main(int, char**) {
+  {
+    using C = std::flat_multimap<MoveOnly, MoveOnly>;
+    LIBCPP_STATIC_ASSERT(std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+  {
+    using C = std::flat_multimap<int, int, std::less<int>, std::deque<int, test_allocator<int>>>;
+    LIBCPP_STATIC_ASSERT(std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+#if _LIBCPP_VERSION
+  {
+    // Container fails to be nothrow-move-constructible; this relies on libc++'s support for non-nothrow-copyable allocators
+    using C =
+        std::flat_multimap<int, int, std::less<int>, std::deque<int, ThrowingMoveAllocator<int>>, std::vector<int>>;
+    static_assert(!std::is_nothrow_move_constructible_v<std::deque<int, ThrowingMoveAllocator<int>>>);
+    static_assert(!std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+  {
+    // Container fails to be nothrow-move-constructible; this relies on libc++'s support for non-nothrow-copyable allocators
+    using C =
+        std::flat_multimap<int, int, std::less<int>, std::vector<int>, std::deque<int, ThrowingMoveAllocator<int>>>;
+    static_assert(!std::is_nothrow_move_constructible_v<std::deque<int, ThrowingMoveAllocator<int>>>);
+    static_assert(!std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+#endif // _LIBCPP_VERSION
+  {
+    // Comparator fails to be nothrow-move-constructible
+    using C = std::flat_multimap<int, int, ThrowingMoveComp>;
+    static_assert(!std::is_nothrow_move_constructible_v<C>);
+    C c;
+    C d = std::move(c);
+  }
+  return 0;
+}
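Both *_noexcept tests above exercise a conforming extension: libc++ gives flat_multimap's move operations a stronger noexcept specification than the standard requires, so the positive assertions are wrapped in LIBCPP_STATIC_ASSERT while the negative ones must hold for any conforming implementation. The condition under test boils down to a conjunction of traits on the adapted types; a minimal sketch (illustrative names, not library internals):

    #include <type_traits>

    // The move constructor's expected noexcept condition, per the tests above.
    template <class KeyContainer, class MappedContainer, class Compare>
    inline constexpr bool nothrow_move_ctor_expected =
        std::is_nothrow_move_constructible_v<KeyContainer> &&
        std::is_nothrow_move_constructible_v<MappedContainer> &&
        std::is_nothrow_copy_constructible_v<Compare>;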
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp
new file mode 100644
index 0000000000000..8b518f6afbda9
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/pmr.pass.cpp
@@ -0,0 +1,361 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: availability-pmr-missing
+
+// <flat_map>
+
+// Test various constructors with pmr
+
+#include <algorithm>
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <memory_resource>
+#include <ranges>
+#include <string>
+#include <vector>
+
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "test_allocator.h"
+#include "../../../test_compare.h"
+
+int main(int, char**) {
+  {
+    // flat_multimap(const Allocator& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::polymorphic_allocator<int> pa = &mr;
+    auto m1 = M(pa);
+    assert(m1.empty());
+    assert(m1.keys().get_allocator() == pa);
+    assert(m1.values().get_allocator() == pa);
+    auto m2 = M(&mr);
+    assert(m2.empty());
+    assert(m2.keys().get_allocator() == pa);
+    assert(m2.values().get_allocator() == pa);
+  }
+  {
+    // flat_multimap(const key_compare& comp, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::greater<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    vm.emplace_back(std::greater<int>());
+    assert(vm[0] == M{});
+    assert(vm[0].key_comp()(2, 1) == true);
+    assert(vm[0].value_comp()({2, 0}, {1, 0}) == true);
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(const key_container_type& key_cont, const mapped_container_type& mapped_cont,
+    //               const Allocator& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pmr::vector<int> ks = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    std::pmr::vector<int> vs = {1, 1, 1, 2, 2, 3, 2, 3, 3};
+    assert(ks.get_allocator().resource() != &mr);
+    assert(vs.get_allocator().resource() != &mr);
+    vm.emplace_back(ks, vs);
+    assert(ks.size() == 9); // ks' value is unchanged, since it was an lvalue above
+    assert(vs.size() == 9); // vs' value is unchanged, since it was an lvalue above
+    assert((vm[0] == M{{1, 1}, {1, 1}, {1, 1}, {2, 2}, {2, 2}, {2, 2}, {3, 3}, {3, 3}, {3, 3}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(const flat_multimap&, const allocator_type&);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({1, 2, 1}, {2, 2, 1}, C(5), &mr1);
+    M m  = {mo, &mr2}; // also test the implicitness of this constructor
+
+    assert(m.key_comp() == C(5));
+    assert((m.keys() == std::pmr::vector<int>{1, 1, 2}));
+    assert((m.values() == std::pmr::vector<int>{2, 1, 2}));
+    assert(m.keys().get_allocator().resource() == &mr2);
+    assert(m.values().get_allocator().resource() == &mr2);
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert((mo.keys() == std::pmr::vector<int>{1, 1, 2}));
+    assert((mo.values() == std::pmr::vector<int>{2, 1, 2}));
+    assert(mo.keys().get_allocator().resource() == &mr1);
+    assert(mo.values().get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(const flat_multimap&, const allocator_type&);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::deque<int>>;
+    std::pmr::vector<M> vs;
+    M m = {{1, 2}, {1, 2}, {3, 1}};
+    vs.push_back(m);
+    assert(vs[0] == m);
+  }
+  {
+    // flat_multimap& operator=(const flat_multimap& m);
+    // pmr allocator is not propagated
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::deque<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({{1, 1}, {1, 2}, {3, 3}}, &mr1);
+    M m  = M({{4, 4}, {4, 5}}, &mr2);
+    m    = mo;
+    assert((m == M{{1, 1}, {1, 2}, {3, 3}}));
+    assert(m.keys().get_allocator().resource() == &mr2);
+    assert(m.values().get_allocator().resource() == &mr2);
+
+    // mo is unchanged
+    assert((mo == M{{1, 1}, {1, 2}, {3, 3}}));
+    assert(mo.keys().get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(const flat_multimap& m);
+    using C = test_less<int>;
+    std::pmr::monotonic_buffer_resource mr;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    auto mo = M({{1, 1}, {1, 2}, {3, 3}}, C(5), &mr);
+    auto m  = mo;
+
+    assert(m.key_comp() == C(5));
+    assert((m == M{{1, 1}, {1, 2}, {3, 3}}));
+    auto [ks, vs] = std::move(m).extract();
+    assert(ks.get_allocator().resource() == std::pmr::get_default_resource());
+    assert(vs.get_allocator().resource() == std::pmr::get_default_resource());
+
+    // mo is unchanged
+    assert(mo.key_comp() == C(5));
+    assert((mo == M{{1, 1}, {1, 2}, {3, 3}}));
+    auto [kso, vso] = std::move(mo).extract();
+    assert(kso.get_allocator().resource() == &mr);
+    assert(vso.get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(initializer_list<value_type> il, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::initializer_list<M::value_type> il = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}};
+    vm.emplace_back(il);
+    assert((vm[0] == M{{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(initializer_list<value_type> il, const key_compare& comp, const Alloc& a);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::deque<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::initializer_list<M::value_type> il = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}};
+    vm.emplace_back(il, C(5));
+    assert((vm[0] == M{{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+    assert(vm[0].key_comp() == C(5));
+  }
+  {
+    // flat_multimap(InputIterator first, InputIterator last, const Allocator& a);
+    using P      = std::pair<int, int>;
+    P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+    P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}};
+    {
+      // cpp17 iterator
+      using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 9));
+      assert(std::ranges::equal(vm[0].keys(), expected | std::views::elements<0>));
+      LIBCPP_ASSERT(std::ranges::equal(vm[0], expected));
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+    {
+      using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(ar, ar);
+      assert(vm[0].empty());
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+  }
+  {
+    // flat_multimap(flat_multimap&&, const allocator_type&);
+    std::pair<int, int> expected[] = {{1, 1}, {1, 1}, {2, 2}, {3, 1}};
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::deque<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({{1, 1}, {3, 1}, {1, 1}, {2, 2}}, C(5), &mr1);
+    M m  = {std::move(mo), &mr2}; // also test the implicitness of this constructor
+
+    assert(m.key_comp() == C(5));
+    assert(m.size() == 4);
+    assert(m.keys().get_allocator().resource() == &mr2);
+    assert(m.values().get_allocator().resource() == &mr2);
+    assert(std::ranges::equal(m, expected));
+
+    // The original flat_multimap is moved-from.
+    assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp()));
+    assert(mo.key_comp() == C(5));
+    assert(mo.keys().get_allocator().resource() == &mr1);
+    assert(mo.values().get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(flat_multimap&&, const allocator_type&);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::deque<int>, std::pmr::vector<int>>;
+    std::pmr::vector<M> vs;
+    M m = {{1, 1}, {3, 1}, {1, 1}, {2, 2}};
+    vs.push_back(std::move(m));
+    assert((vs[0].keys() == std::pmr::deque<int>{1, 1, 2, 3}));
+    assert((vs[0].values() == std::pmr::vector<int>{1, 1, 2, 1}));
+  }
+  {
+    // flat_multimap& operator=(flat_multimap&&);
+    using M = std::
+        flat_multimap<std::pmr::string, int, std::less<>, std::pmr::vector<std::pmr::string>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr1;
+    std::pmr::monotonic_buffer_resource mr2;
+    M mo = M({{"short", 1},
+              {"very long string that definitely won't fit in the SSO buffer and therefore becomes empty on move", 2}},
+             &mr1);
+    M m = M({{"don't care", 3}}, &mr2);
+    m   = std::move(mo);
+    assert(m.size() == 2);
+    assert(std::is_sorted(m.begin(), m.end(), m.value_comp()));
+    assert(m.begin()->first.get_allocator().resource() == &mr2);
+
+    assert(std::is_sorted(mo.begin(), mo.end(), mo.value_comp()));
+    mo.insert({"foo", 1});
+    assert(mo.begin()->first.get_allocator().resource() == &mr1);
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const Alloc&);
+    using P      = std::pair<int, int>;
+    P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+    P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}};
+    {
+      // input_range
+      using M    = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      using Iter = cpp20_input_iterator<const P*>;
+      using Sent = sentinel_wrapper<Iter>;
+      using R    = std::ranges::subrange<Iter, Sent>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+      assert(std::ranges::equal(vm[0].keys(), expected | std::views::elements<0>));
+      LIBCPP_ASSERT(std::ranges::equal(vm[0], expected));
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+    {
+      using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+      using R = std::ranges::subrange<const P*>;
+      std::pmr::monotonic_buffer_resource mr;
+      std::pmr::vector<M> vm(&mr);
+      vm.emplace_back(std::from_range, R(ar, ar));
+      assert(vm[0].empty());
+      assert(vm[0].keys().get_allocator().resource() == &mr);
+      assert(vm[0].values().get_allocator().resource() == &mr);
+    }
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+    //               const mapped_container_type& mapped_cont, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pmr::vector<int> ks = {1, 1, 4, 10};
+    std::pmr::vector<int> vs = {4, 3, 2, 1};
+    vm.emplace_back(std::sorted_equivalent, ks, vs);
+    assert(!ks.empty()); // it was an lvalue above
+    assert(!vs.empty()); // it was an lvalue above
+    assert((vm[0] == M{{1, 4}, {1, 3}, {4, 2}, {10, 1}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+    //               const mapped_container_type& mapped_cont, const Alloc& a);
+    using M = std::flat_multimap<int, int, std::less<int>, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pmr::vector<int> ks({1, 1, 4, 10}, &mr);
+    std::pmr::vector<int> vs({4, 3, 2, 1}, &mr);
+    vm.emplace_back(std::sorted_equivalent, ks, vs);
+    assert((vm[0] == M{{1, 4}, {1, 3}, {4, 2}, {10, 1}}));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    // cpp_17
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {1, 2}, {1, 4}, {5, 5}};
+    vm.emplace_back(
+        std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4), C(3));
+    assert((vm[0] == M{{1, 1}, {1, 2}, {1, 4}, {5, 5}}));
+    assert(vm[0].key_comp() == C(3));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pair<int, int> ar[1] = {{42, 42}};
+    vm.emplace_back(std::sorted_equivalent, ar, ar, C(4));
+    assert(vm[0] == M{});
+    assert(vm[0].key_comp() == C(4));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    // cpp_17
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {1, 2}, {1, 4}, {5, 5}};
+    vm.emplace_back(
+        std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4), C(3));
+    assert((vm[0] == M{{1, 1}, {1, 2}, {1, 4}, {5, 5}}));
+    assert(vm[0].key_comp() == C(3));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last,
+    //               const key_compare& comp, const Alloc& a);
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::pmr::vector<int>, std::pmr::vector<int>>;
+    std::pmr::monotonic_buffer_resource mr;
+    std::pmr::vector<M> vm(&mr);
+    std::pair<int, int> ar[1] = {{42, 42}};
+    vm.emplace_back(std::sorted_equivalent, ar, ar, C(4));
+    assert(vm[0] == M{});
+    assert(vm[0].key_comp() == C(4));
+    assert(vm[0].keys().get_allocator().resource() == &mr);
+    assert(vm[0].values().get_allocator().resource() == &mr);
+  }
+
+  return 0;
+}
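A pattern that recurs throughout pmr.pass.cpp: the map is built via emplace_back into a std::pmr::vector<M>, and the test then asserts that keys() and values() ended up on the vector's memory resource. That works because polymorphic_allocator performs uses-allocator construction of the element, passing its resource down into the nested containers. A condensed sketch of the mechanism (standard behavior only, no test-suite helpers):

    #include <cassert>
    #include <flat_map>
    #include <functional>
    #include <memory_resource>
    #include <vector>

    int main() {
      using M = std::flat_multimap<int, int, std::less<int>,
                                   std::pmr::vector<int>, std::pmr::vector<int>>;
      std::pmr::monotonic_buffer_resource mr;
      std::pmr::vector<M> vm(&mr); // outer container owns the resource
      vm.emplace_back();           // uses-allocator construction of M
      assert(vm[0].keys().get_allocator().resource() == &mr);   // propagated
      assert(vm[0].values().get_allocator().resource() == &mr); // propagated
    }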
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp
new file mode 100644
index 0000000000000..de750e2506341
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/range.pass.cpp
@@ -0,0 +1,227 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// template <container-compatible-range<value_type> R>
+//   flat_multimap(from_range_t, R&&)
+// template <container-compatible-range<value_type> R>
+//   flat_multimap(from_range_t, R&&, const key_compare&)
+// template <container-compatible-range<value_type> R, class Alloc>
+//   flat_multimap(from_range_t, R&&, const Alloc&);
+// template <container-compatible-range<value_type> R, class Alloc>
+//   flat_multimap(from_range_t, R&&, const key_compare&, const Alloc&);
+
+#include <algorithm>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <ranges>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+// test constraint container-compatible-range
+
+template <class V>
+using RangeOf = std::ranges::subrange<V*>;
+using Map     = std::flat_multimap<int, double>;
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>>);
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>, std::less<int>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>, std::less<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>, std::less<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>, std::less<int>>);
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>, std::allocator<int>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>, std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>, std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>, std::allocator<int>>);
+
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, double>>,
+                                      std::less<int>,
+                                      std::allocator<int>>);
+static_assert(std::is_constructible_v<Map, std::from_range_t, RangeOf<std::pair<int, int>>,
+                                      std::less<int>,
+                                      std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<int>, std::less<int>, std::allocator<int>>);
+static_assert(!std::is_constructible_v<Map, std::from_range_t, RangeOf<double>, std::less<int>, std::allocator<int>>);
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = other_allocator<int>;
+    using V1 = std::vector<int, A1>;
+    using V2 = std::vector<int, A2>;
+    using M1 = std::flat_multimap<int, int, C, V1, V1>;
+    using M2 = std::flat_multimap<int, int, C, V1, V2>;
+    using M3 = std::flat_multimap<int, int, C, V2, V1>;
+    static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::from_range_t, M1, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::from_range_t, M2, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::from_range_t, M3, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::from_range_t, M1, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::from_range_t, M1, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::from_range_t, M2, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::from_range_t, M3, const C&, const A2&>);
+  }
+  {
+    // container-compatible-range
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<std::string>;
+    using M  = std::flat_multimap<int, std::string, C, std::vector<int, A1>, std::vector<std::string, A2>>;
+    using Pair        = std::pair<int, std::string>;
+    using PairLike    = std::tuple<int, std::string>;
+    using NonPairLike = int;
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&>);
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&, const C&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&, const C&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&, const C&>);
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&, const A1&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&, const A1&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&, const A1&>);
+
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<Pair>&, const C&, const A1&>);
+    static_assert(std::is_constructible_v<M, std::from_range_t, std::vector<PairLike>&, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M, std::from_range_t, std::vector<NonPairLike>&, const C&, const A1&>);
+  }
+
+  using P      = std::pair<int, int>;
+  P ar[]       = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {3, 6}, {2, 7}, {3, 8}, {3, 9}};
+  P expected[] = {{1, 1}, {1, 2}, {1, 3}, {2, 4}, {2, 5}, {2, 7}, {3, 6}, {3, 8}, {3, 9}};
+  {
+    // flat_multimap(from_range_t, R&&)
+    // input_range && !common
+    using M    = std::flat_multimap<int, int>;
+    using Iter = cpp20_input_iterator<const P*>;
+    using Sent = sentinel_wrapper<Iter>;
+    using R    = std::ranges::subrange<Iter, Sent>;
+    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+
+    // explicit(false)
+    M m2 = {std::from_range, R(Iter(ar), Sent(Iter(ar + 9)))};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(from_range_t, R&&)
+    // greater
+    using M    = std::flat_multimap<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::deque<int>>;
+    using Iter = cpp20_input_iterator<const P*>;
+    using Sent = sentinel_wrapper<Iter>;
+    using R    = std::ranges::subrange<Iter, Sent>;
+    auto m     = M(std::from_range, R(Iter(ar), Sent(Iter(ar + 9))));
+    assert((m.keys() == std::deque<int, min_allocator<int>>{3, 3, 3, 2, 2, 2, 1, 1, 1}));
+    LIBCPP_ASSERT((m.values() == std::deque<int>{6, 8, 9, 4, 5, 7, 1, 2, 3}));
+  }
+  {
+    // flat_multimap(from_range_t, R&&)
+    // contiguous range
+    using M = std::flat_multimap<int, int>;
+    using R = std::ranges::subrange<const P*>;
+    auto m  = M(std::from_range, R(ar, ar + 9));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const key_compare&)
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C, std::vector<int>, std::deque<int>>;
+    using R = std::ranges::subrange<const P*>;
+    auto m  = M(std::from_range, R(ar, ar + 9), C(3));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.key_comp() == C(3));
+
+    // explicit(false)
+    M m2 = {std::from_range, R(ar, ar + 9), C(3)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(3));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::vector<int, A1>, std::deque<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    auto m   = M(std::from_range, R(ar, ar + 9), A1(5));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::vector<int, A1>, std::deque<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    M m      = {std::from_range, R(ar, ar + 9), A1(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const key_compare&, const Allocator&)
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, C, std::vector<int, A1>, std::deque<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    auto m   = M(std::from_range, R(ar, ar + 9), C(3), A1(5));
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(from_range_t, R&&, const key_compare&, const Allocator&)
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::deque<int, A1>, std::vector<int, A2>>;
+    using R  = std::ranges::subrange<const P*>;
+    M m      = {std::from_range, R(ar, ar + 9), {}, A2(5)}; // implicit ctor
+    assert(std::ranges::equal(m.keys(), expected | std::views::elements<0>));
+    LIBCPP_ASSERT(std::ranges::equal(m, expected));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+
+  return 0;
+}
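The is_constructible checks at the top of range.pass.cpp all hinge on the exposition-only container-compatible-range concept: the range's reference type must convert to the map's value_type. A simplified model of the idea (the real exposition-only concept in the standard has more moving parts):

    #include <concepts>
    #include <ranges>
    #include <utility>
    #include <vector>

    // Simplified: an input range whose references convert to the value type T.
    template <class R, class T>
    concept compatible_range_sketch =
        std::ranges::input_range<R> &&
        std::convertible_to<std::ranges::range_reference_t<R>, T>;

    static_assert(compatible_range_sketch<std::vector<std::pair<int, int>>, std::pair<int, int>>);
    static_assert(!compatible_range_sketch<std::vector<int>, std::pair<int, int>>);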
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp
new file mode 100644
index 0000000000000..16579f0deed5d
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_container.pass.cpp
@@ -0,0 +1,165 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap(sorted_equivalent_t, key_container_type key_cont, mapped_container_type mapped_cont,
+//               const key_compare& comp = key_compare());
+//
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+//                 const mapped_container_type& mapped_cont, const Alloc& a);
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, const key_container_type& key_cont,
+//                 const mapped_container_type& mapped_cont,
+//                 const key_compare& comp, const Alloc& a);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "min_allocator.h"
+#include "MoveOnly.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = other_allocator<int>;
+    using V1 = std::vector<int, A1>;
+    using V2 = std::vector<int, A2>;
+    using M1 = std::flat_multimap<int, int, C, V1, V1>;
+    using M2 = std::flat_multimap<int, int, C, V1, V2>;
+    using M3 = std::flat_multimap<int, int, C, V2, V1>;
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V1&, const V2&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, const V2&, const V1&, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, const V1&, const V1&, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, const V1&, const V2&, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, const V2&, const V1&, const C&, const A2&>);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type)
+    using M = std::flat_multimap<int, int>;
+    std::vector<int> ks = {1, 4, 4, 10};
+    std::vector<int> vs = {4, 3, 2, 1};
+    auto ks2 = ks;
+    auto vs2 = vs;
+
+    auto m = M(std::sorted_equivalent, ks, vs);
+    assert((m == M{{1, 4}, {4, 3}, {4, 2}, {10, 1}}));
+    m = M(std::sorted_equivalent, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 4}, {4, 3}, {4, 2}, {10, 1}}));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, std::move(ks2), std::move(vs2)};
+    assert(m == m2);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type)
+    // non-default container, comparator and allocator type
+    using Ks = std::deque<int, min_allocator<int>>;
+    using Vs = std::deque<char, min_allocator<char>>;
+    using M  = std::flat_multimap<int, char, std::greater<int>, Ks, Vs>;
+    Ks ks    = {10, 1, 1, 1};
+    Vs vs    = {1, 2, 3, 4};
+    auto m   = M(std::sorted_equivalent, ks, vs);
+    assert((m == M{{1, 2}, {1, 3}, {1, 4}, {10, 1}}));
+    m = M(std::sorted_equivalent, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{1, 2}, {1, 3}, {1, 4}, {10, 1}}));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type)
+    // allocator copied into the containers
+    using A = test_allocator<int>;
+    using M = std::flat_multimap<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
+    auto ks = std::vector<int, A>({2, 2, 4, 10}, A(4));
+    auto vs = std::deque<int, A>({4, 3, 2, 1}, A(5));
+    auto m  = M(std::sorted_equivalent, std::move(ks), std::move(vs));
+    assert(ks.empty()); // it was moved-from
+    assert(vs.empty()); // it was moved-from
+    assert((m == M{{2, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.keys().get_allocator() == A(4));
+    assert(m.values().get_allocator() == A(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type, key_compare)
+    using C = test_less<int>;
+    using M = std::flat_multimap<int, int, C>;
+    std::vector<int> ks = {1, 2, 10, 10};
+    std::vector<int> vs = {4, 3, 2, 1};
+
+    auto m = M(std::sorted_equivalent, ks, vs, C(4));
+    assert((m == M{{1, 4}, {2, 3}, {10, 2}, {10, 1}}));
+    assert(m.key_comp() == C(4));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, ks, vs, C(4)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type, key_compare, const Allocator&)
+    using C = test_less<int>;
+    using A = test_allocator<int>;
+    using M = std::flat_multimap<int, int, C, std::vector<int, A>, std::vector<int, A>>;
+    std::vector<int, A> ks = {1, 2, 4, 10};
+    std::vector<int, A> vs = {4, 3, 2, 1};
+    auto m = M(std::sorted_equivalent, ks, vs, C(4), A(5));
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {10, 1}}));
+    assert(m.key_comp() == C(4));
+    assert(m.keys().get_allocator() == A(5));
+    assert(m.values().get_allocator() == A(5));
+
+    // explicit(false)
+    M m2 = {ks, vs, C(4), A(5)};
+    assert(m2 == m);
+    assert(m2.key_comp() == C(4));
+    assert(m2.keys().get_allocator() == A(5));
+    assert(m2.values().get_allocator() == A(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, key_container_type , mapped_container_type, const Allocator&)
+    using A = test_allocator<int>;
+    using M = std::flat_multimap<int, int, std::less<int>, std::vector<int, A>, std::deque<int, A>>;
+    auto ks = std::vector<int, A>({1, 2, 4, 4}, A(4));
+    auto vs = std::deque<int, A>({4, 3, 2, 1}, A(5));
+    auto m  = M(std::sorted_equivalent, ks, vs, A(6)); // replaces the allocators
+    assert(!ks.empty()); // it was an lvalue above
+    assert(!vs.empty()); // it was an lvalue above
+    assert((m == M{{1, 4}, {2, 3}, {4, 2}, {4, 1}}));
+    assert(m.keys().get_allocator() == A(6));
+    assert(m.values().get_allocator() == A(6));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, ks, vs, A(6)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A(6));
+    assert(m2.values().get_allocator() == A(6));
+  }
+
+  return 0;
+}
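The point of the std::sorted_equivalent tag exercised above is that the constructor may adopt the containers as-is: the precondition is only that the keys are already sorted with respect to the comparator, and equivalent keys are allowed (unlike std::sorted_unique for flat_map). A minimal usage sketch:

    #include <cassert>
    #include <flat_map>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<int> ks = {1, 4, 4, 10}; // pre-sorted, duplicates allowed
      std::vector<int> vs = {4, 3, 2, 1};
      std::flat_multimap<int, int> m(std::sorted_equivalent, std::move(ks), std::move(vs));
      assert(m.size() == 4); // containers adopted without re-sorting
    }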
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp
new file mode 100644
index 0000000000000..b34313bb3d404
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_initializer_list.pass.cpp
@@ -0,0 +1,183 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// flat_multimap(sorted_equivalent_t s, initializer_list<value_type> il,
+//               const key_compare& comp = key_compare())
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, initializer_list<value_type> il, const Alloc& a);
+// template <class Alloc>
+//   flat_multimap(sorted_equivalent_t, initializer_list<value_type> il,
+//                 const key_compare& comp, const Alloc& a);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+template <class T, class U>
+std::initializer_list<std::pair<T, U>> il = {{1, 1}, {4, 2}, {4, 4}, {5, 5}};
+
+const auto il1 = il<int, int>;
+const auto il2 = il<int, short>;
+const auto il3 = il<short, int>;
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = other_allocator<int>;
+    using V1 = std::vector<int, A1>;
+    using V2 = std::vector<int, A2>;
+    using M1 = std::flat_multimap<int, int, C, V1, V1>;
+    using M2 = std::flat_multimap<int, int, C, V1, V2>;
+    using M3 = std::flat_multimap<int, int, C, V2, V1>;
+    using IL = std::initializer_list<std::pair<int, int>>;
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, IL, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, IL, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, IL, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, IL, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, IL, const C&, const A2&>);
+  }
+  {
+    // initializer_list<value_type> needs to match exactly
+    using M = std::flat_multimap<int, int>;
+    using C = typename M::key_compare;
+    static_assert(std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<int, int>>>);
+    static_assert(
+        std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<int, int>>, C>);
+    static_assert(std::is_constructible_v<M,
+                                          std::sorted_equivalent_t,
+                                          std::initializer_list<std::pair<int, int>>,
+                                          C,
+                                          std::allocator<int>>);
+    static_assert(std::is_constructible_v<M,
+                                          std::sorted_equivalent_t,
+                                          std::initializer_list<std::pair<int, int>>,
+                                          std::allocator<int>>);
+    static_assert(
+        !std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<const int, int>>>);
+    static_assert(
+        !std::is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<const int, int>>, C>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, int>>,
+                                           C,
+                                           std::allocator<int>>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, int>>,
+                                           std::allocator<int>>);
+    static_assert(
+        !std::
+            is_constructible_v<M, std::sorted_equivalent_t, std::initializer_list<std::pair<const int, const int>>>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, const int>>,
+                                           C>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, const int>>,
+                                           C,
+                                           std::allocator<int>>);
+    static_assert(!std::is_constructible_v<M,
+                                           std::sorted_equivalent_t,
+                                           std::initializer_list<std::pair<const int, const int>>,
+                                           std::allocator<int>>);
+  }
+
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>);
+    using M       = std::flat_multimap<int, int>;
+    auto m        = M(std::sorted_equivalent, il1);
+    auto expected = M{{1, 1}, {4, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, il1};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
+    using M = std::flat_multimap<int, int, std::function<bool(int, int)>>;
+    auto m  = M(std::sorted_equivalent, il1, std::less<int>());
+    assert(m == M({{1, 1}, {4, 2}, {4, 4}, {5, 5}}, std::less<>()));
+    assert(m.key_comp()(1, 2) == true);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, il1, std::less<int>()};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&);
+    // greater
+    using M = std::flat_multimap<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::vector<int>>;
+    std::initializer_list<std::pair<int, int>> il4{{5, 5}, {4, 4}, {1, 2}, {1, 1}};
+    auto m = M(std::sorted_equivalent, il4, std::greater<int>());
+    assert((m == M{{5, 5}, {4, 4}, {1, 2}, {1, 1}}));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_multimap<int, short, std::less<int>, std::vector<int, A1>, std::deque<short, A2>>;
+    auto m        = M(std::sorted_equivalent, il2, A1(5));
+    auto expected = M{{1, 1}, {4, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, il2, A1(5)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A1(5));
+    assert(m2.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<short>;
+    using M  = std::flat_multimap<int, short, C, std::vector<int, A1>, std::deque<short, A2>>;
+    auto m   = M(std::sorted_equivalent, il2, C(3), A1(5));
+    assert((m == M{{1, 1}, {4, 2}, {4, 4}, {5, 5}}));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, initializer_list<value_type>, const key_compare&, const Allocator&);
+    // explicit(false)
+    using A1 = test_allocator<short>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<short, int, std::less<short>, std::deque<short, A1>, std::vector<int, A2>>;
+    M m      = {std::sorted_equivalent, il3, {}, A1(5)}; // implicit ctor
+    assert((m == M{{1, 1}, {4, 2}, {4, 4}, {5, 5}}));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp
new file mode 100644
index 0000000000000..45c4b3dc675a5
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.cons/sorted_iter_iter.pass.cpp
@@ -0,0 +1,173 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// template <class InputIterator>
+//   flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last, const key_compare& comp = key_compare());
+// template <class InputIterator, class Alloc>
+//   flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last, const Alloc& a);
+// template <class InputIterator, class Allocator>
+//   flat_multimap(sorted_equivalent_t, InputIterator first, InputIterator last, const key_compare& comp, const Allocator& a);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "min_allocator.h"
+#include "test_allocator.h"
+#include "test_iterators.h"
+#include "test_macros.h"
+#include "../../../test_compare.h"
+
+int main(int, char**) {
+  {
+    // The constructors in this subclause shall not participate in overload
+    // resolution unless uses_allocator_v<key_container_type, Alloc> is true
+    // and uses_allocator_v<mapped_container_type, Alloc> is true.
+    using C     = test_less<int>;
+    using A1    = test_allocator<int>;
+    using A2    = other_allocator<int>;
+    using V1    = std::vector<int, A1>;
+    using V2    = std::vector<int, A2>;
+    using M1    = std::flat_multimap<int, int, C, V1, V1>;
+    using M2    = std::flat_multimap<int, int, C, V1, V2>;
+    using M3    = std::flat_multimap<int, int, C, V2, V1>;
+    using Iter1 = typename M1::iterator;
+    using Iter2 = typename M2::iterator;
+    using Iter3 = typename M3::iterator;
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, Iter3, Iter3, const A2&>);
+
+    static_assert(std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A1&>);
+    static_assert(!std::is_constructible_v<M1, std::sorted_equivalent_t, Iter1, Iter1, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M2, std::sorted_equivalent_t, Iter2, Iter2, const C&, const A2&>);
+    static_assert(!std::is_constructible_v<M3, std::sorted_equivalent_t, Iter3, Iter3, const C&, const A2&>);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator);
+    // cpp17_input_iterator
+    using M = std::flat_multimap<int, int>;
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {4, 4}, {5, 5}, {5, 2}};
+    auto m  = M(std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4));
+    auto expected = M{{1, 1}, {4, 4}, {5, 5}, {5, 2}};
+    assert(m == expected);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, cpp17_input_iterator<const P*>(ar), cpp17_input_iterator<const P*>(ar + 4)};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator);
+    // contiguous iterator
+    using C = test_less<int>;
+    using M =
+        std::flat_multimap<int, int, C, std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>;
+    std::pair<int, int> ar[] = {{1, 1}, {1, 4}, {2, 2}, {5, 5}};
+    auto m                   = M(std::sorted_equivalent, ar, ar + 4);
+    auto expected            = M{{1, 1}, {1, 4}, {2, 2}, {5, 5}};
+    assert(m == expected);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
+    // cpp_17_input_iterator
+    using M = std::flat_multimap<int, int, std::function<bool(int, int)>>;
+    using P = std::pair<int, int>;
+    P ar[]  = {{1, 1}, {2, 2}, {2, 4}, {5, 5}};
+    auto m  = M(std::sorted_equivalent,
+               cpp17_input_iterator<const P*>(ar),
+               cpp17_input_iterator<const P*>(ar + 4),
+               std::less<int>());
+    assert(m == M({{1, 1}, {2, 2}, {2, 4}, {5, 5}}, std::less<>()));
+    assert(m.key_comp()(1, 2) == true);
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent,
+            cpp17_input_iterator<const P*>(ar),
+            cpp17_input_iterator<const P*>(ar + 4),
+            std::less<int>()};
+    assert(m2 == m);
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
+    // greater
+    using M = std::flat_multimap<int, int, std::greater<int>, std::deque<int, min_allocator<int>>, std::vector<int>>;
+    using P = std::pair<int, int>;
+    P ar[]  = {{5, 5}, {2, 4}, {2, 2}, {1, 1}};
+    auto m  = M(std::sorted_equivalent,
+               cpp17_input_iterator<const P*>(ar),
+               cpp17_input_iterator<const P*>(ar + 4),
+               std::greater<int>());
+    assert((m == M{{5, 5}, {2, 4}, {2, 2}, {1, 1}}));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&);
+    // contiguous iterator
+    using C = test_less<int>;
+    using M =
+        std::flat_multimap<int, int, C, std::vector<int, min_allocator<int>>, std::vector<int, min_allocator<int>>>;
+    std::pair<int, int> ar[1] = {{42, 42}};
+    auto m                    = M(std::sorted_equivalent, ar, ar, C(5));
+    assert(m.empty());
+    assert(m.key_comp() == C(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator , InputIterator, const Allocator&)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::vector<int, A1>, std::deque<int, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[]        = {{2, 1}, {2, 2}, {4, 4}, {5, 5}};
+    auto m        = M(std::sorted_equivalent, ar, ar + 4, A1(5));
+    auto expected = M{{2, 1}, {2, 2}, {4, 4}, {5, 5}};
+    assert(m == expected);
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+
+    // explicit(false)
+    M m2 = {std::sorted_equivalent, ar, ar + 4, A1(5)};
+    assert(m2 == m);
+    assert(m2.keys().get_allocator() == A1(5));
+    assert(m2.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
+    using C  = test_less<int>;
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, C, std::vector<int, A1>, std::deque<int, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[] = {{1, 1}, {1, 2}, {1, 4}, {1, 5}};
+    auto m = M(std::sorted_equivalent, ar, ar + 4, C(3), A1(5));
+    assert((m == M{{1, 1}, {1, 2}, {1, 4}, {1, 5}}));
+    assert(m.key_comp() == C(3));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+  {
+    // flat_multimap(sorted_equivalent_t, InputIterator, InputIterator, const key_compare&, const Allocator&);
+    // explicit(false)
+    using A1 = test_allocator<int>;
+    using A2 = test_allocator<int>;
+    using M  = std::flat_multimap<int, int, std::less<int>, std::deque<int, A1>, std::vector<int, A2>>;
+    using P  = std::pair<int, int>;
+    P ar[] = {{1, 1}, {1, 2}, {1, 4}, {1, 5}};
+    M m    = {std::sorted_equivalent, ar, ar + 4, {}, A1(5)}; // implicit ctor
+    assert((m == M{{1, 1}, {1, 2}, {1, 4}, {1, 5}}));
+    assert(m.keys().get_allocator() == A1(5));
+    assert(m.values().get_allocator() == A2(5));
+  }
+
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp
new file mode 100644
index 0000000000000..76d5cbd909050
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if.pass.cpp
@@ -0,0 +1,98 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// class flat_multimap
+
+// template <class Key, class T, class Compare, class KeyContainer, class MappedContainer, class Predicate>
+//   typename flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>::size_type
+//   erase_if(flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
+
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <initializer_list>
+#include <vector>
+
+#include "test_macros.h"
+#include "test_allocator.h"
+#include "min_allocator.h"
+
+// Verify that `flat_multimap` (like `multimap`) does NOT support std::erase.
+//
+template <class S>
+concept HasStdErase = requires(S& s, typename S::value_type x) { std::erase(s, x); };
+static_assert(HasStdErase<std::vector<int>>);
+static_assert(!HasStdErase<std::flat_multimap<int, int>>);
+
+template <class M>
+M make(std::initializer_list<int> vals) {
+  M ret;
+  for (int v : vals) {
+    ret.emplace(static_cast<typename M::key_type>(v), static_cast<typename M::mapped_type>(v + 10));
+  }
+  return ret;
+}
+
+template <class M, class Pred>
+void test0(
+    std::initializer_list<int> vals, Pred p, std::initializer_list<int> expected, std::size_t expected_erased_count) {
+  M s = make<M>(vals);
+  ASSERT_SAME_TYPE(typename M::size_type, decltype(std::erase_if(s, p)));
+  assert(expected_erased_count == std::erase_if(s, p));
+  assert(s == make<M>(expected));
+}
+
+template <class S>
+void test() {
+  // Test all the plausible signatures for this predicate.
+  auto is1   = [](typename S::const_reference v) { return v.first == 1; };
+  auto is2   = [](typename S::value_type v) { return v.first == 2; };
+  auto is3   = [](const typename S::value_type& v) { return v.first == 3; };
+  auto is4   = [](auto v) { return v.first == 4; };
+  auto True  = [](const auto&) { return true; };
+  auto False = [](auto&&) { return false; };
+
+  test0<S>({}, is1, {}, 0);
+
+  test0<S>({1}, is1, {}, 1);
+  test0<S>({1, 1}, is1, {}, 2);
+  test0<S>({1, 1}, is2, {1, 1}, 0);
+
+  test0<S>({1, 2}, is1, {2}, 1);
+  test0<S>({1, 2}, is2, {1}, 1);
+  test0<S>({1, 2, 2, 2}, is2, {1}, 3);
+  test0<S>({1, 2, 2, 2}, is3, {1, 2, 2, 2}, 0);
+
+  test0<S>({1, 1, 2, 2, 3, 3}, is1, {2, 2, 3, 3}, 2);
+  test0<S>({1, 1, 2, 2, 3, 3}, is2, {1, 1, 3, 3}, 2);
+  test0<S>({1, 1, 2, 2, 3, 3}, is3, {1, 1, 2, 2}, 2);
+  test0<S>({1, 1, 2, 2, 3, 3}, is4, {1, 1, 2, 2, 3, 3}, 0);
+
+  test0<S>({1, 2, 2, 3, 3, 3}, True, {}, 6);
+  test0<S>({1, 2, 2, 3, 3, 3}, False, {1, 2, 2, 3, 3, 3}, 0);
+}
+
+int main(int, char**) {
+  test<std::flat_multimap<int, char>>();
+  test<std::flat_multimap<int,
+                          char,
+                          std::less<int>,
+                          std::vector<int, min_allocator<int>>,
+                          std::vector<char, min_allocator<char>>>>();
+  test<std::flat_multimap<int, char, std::greater<int>, std::vector<int, test_allocator<int>>>>();
+  test<std::flat_multimap<int, char, std::less<int>, std::deque<int, min_allocator<int>>>>();
+  test<std::flat_multimap<int, char, std::greater<int>, std::deque<int, test_allocator<int>>>>();
+  test<std::flat_multimap<long, int>>();
+  test<std::flat_multimap<double, int>>();
+
+  return 0;
+}
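As the HasStdErase assertions above document, std::erase_if is the only erasure helper provided for flat_multimap; plain std::erase is deliberately absent, matching std::multimap. A short usage sketch:

    #include <cassert>
    #include <flat_map>

    int main() {
      std::flat_multimap<int, char> m = {{1, 'a'}, {2, 'b'}, {2, 'c'}, {3, 'd'}};
      auto n = std::erase_if(m, [](const auto& kv) { return kv.first == 2; });
      assert(n == 2 && m.size() == 2); // both entries with key 2 removed
    }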
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp
new file mode 100644
index 0000000000000..13b57202f7862
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.erasure/erase_if_exceptions.pass.cpp
@@ -0,0 +1,157 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: no-exceptions
+
+// <flat_map>
+
+// class flat_multimap
+
+// template <class Key, class T, class Compare, class KeyContainer, class MappedContainer, class Predicate>
+//   typename flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>::size_type
+//   erase_if(flat_multimap<Key, T, Compare, KeyContainer, MappedContainer>& c, Predicate pred);
+// If any member function in [flat.multimap.defn] exits via an exception, the invariant is restored.
+// (This is not a member function, but let's respect the invariant anyway.)
+
+#include <algorithm>
+#include <cassert>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "../helpers.h"
+#include "test_macros.h"
+
+struct Counter {
+  int c1, c2, throws;
+  void tick() {
+    c1 -= 1;
+    if (c1 == 0) {
+      c1 = c2;
+      throws += 1;
+      throw 42;
+    }
+  }
+};
+Counter g_counter = {0, 0, 0};
+
+struct ThrowingAssignment {
+  ThrowingAssignment(int i) : i_(i) {}
+  ThrowingAssignment(const ThrowingAssignment&) = default;
+  ThrowingAssignment& operator=(const ThrowingAssignment& rhs) {
+    g_counter.tick();
+    i_ = rhs.i_;
+    g_counter.tick();
+    return *this;
+  }
+  operator int() const { return i_; }
+  int i_;
+};
+
+struct ThrowingComparator {
+  bool operator()(const ThrowingAssignment& a, const ThrowingAssignment& b) const {
+    g_counter.tick();
+    return a.i_ < b.i_;
+  }
+};
+
+struct ErasurePredicate {
+  bool operator()(const auto& x) const { return (3 <= x.first && x.first <= 5); }
+};
+
+int main(int, char**) {
+  const std::pair<int, int> expected[] = {{1, 1}, {2, 2}, {3, 3}, {3, 3}, {5, 5}, {6, 6}, {7, 7}, {8, 8}};
+  {
+    using M = std::flat_multimap<ThrowingAssignment, int, ThrowingComparator>;
+    for (int first_throw = 1; first_throw < 99; ++first_throw) {
+      for (int second_throw = 1; second_throw < 99; ++second_throw) {
+        g_counter = {0, 0, 0};
+        M m       = M({1, 2, 3, 3, 5, 6, 7, 8}, {1, 2, 3, 3, 5, 6, 7, 8});
+        try {
+          g_counter = {first_throw, second_throw, 0};
+          auto n    = std::erase_if(m, ErasurePredicate());
+          assert(n == 3);
+          // If it didn't throw at all, we're done.
+          g_counter = {0, 0, 0};
+          assert((m == M{{1, 1}, {2, 2}, {6, 6}, {7, 7}, {8, 8}}));
+          first_throw = 99; // "done"
+          break;
+        } catch (int ex) {
+          assert(ex == 42);
+          check_invariant(m);
+          LIBCPP_ASSERT(m.empty() || std::equal(m.begin(), m.end(), expected, expected + 8));
+          if (g_counter.throws == 1) {
+            // We reached the first throw but not the second throw.
+            break;
+          }
+        }
+      }
+    }
+  }
+  {
+    using M = std::flat_multimap<int, ThrowingAssignment, ThrowingComparator>;
+    for (int first_throw = 1; first_throw < 99; ++first_throw) {
+      for (int second_throw = 1; second_throw < 99; ++second_throw) {
+        g_counter = {0, 0, 0};
+        M m       = M({1, 2, 3, 3, 5, 6, 7, 8}, {1, 2, 3, 3, 5, 6, 7, 8});
+        try {
+          g_counter = {first_throw, second_throw, 0};
+          auto n    = std::erase_if(m, ErasurePredicate());
+          assert(n == 3);
+          // If it didn't throw at all, we're done.
+          g_counter = {0, 0, 0};
+          assert((m == M{{1, 1}, {2, 2}, {6, 6}, {7, 7}, {8, 8}}));
+          first_throw = 99; // "done"
+          break;
+        } catch (int ex) {
+          assert(ex == 42);
+          check_invariant(m);
+          LIBCPP_ASSERT(m.empty() || std::equal(m.begin(), m.end(), expected, expected + 8));
+          if (g_counter.throws == 1) {
+            // We reached the first throw but not the second throw.
+            break;
+          }
+        }
+      }
+    }
+  }
+  {
+    using M = std::
+        flat_multimap<ThrowingAssignment, int, ThrowingComparator, std::deque<ThrowingAssignment>, std::deque<int>>;
+    for (int first_throw = 1; first_throw < 99; ++first_throw) {
+      for (int second_throw = 1; second_throw < 99; ++second_throw) {
+        g_counter = {0, 0, 0};
+        std::deque<ThrowingAssignment> container = {5, 6, 7, 8};
+        container.insert(container.begin(), {1, 2, 3, 3});
+        M m = M(std::move(container), {1, 2, 3, 3, 5, 6, 7, 8});
+        try {
+          g_counter = {first_throw, second_throw, 0};
+          auto n    = std::erase_if(m, ErasurePredicate());
+          assert(n == 3);
+          // If it didn't throw at all, we're done.
+          g_counter = {0, 0, 0};
+          assert((m == M{{1, 1}, {2, 2}, {6, 6}, {7, 7}, {8, 8}}));
+          first_throw = 99; // "done"
+          break;
+        } catch (int ex) {
+          assert(ex == 42);
+          check_invariant(m);
+          LIBCPP_ASSERT(m.empty() || std::equal(m.begin(), m.end(), expected, expected + 8));
+          if (g_counter.throws == 1) {
+            // We reached the first throw but not the second throw.
+            break;
+          }
+        }
+      }
+    }
+  }
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp
new file mode 100644
index 0000000000000..c1285955e5db6
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator.pass.cpp
@@ -0,0 +1,105 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// class flat_multimap
+
+//       iterator begin()   noexcept;
+// const_iterator begin()   const noexcept
+//       iterator end()     noexcept;
+// const_iterator end()     const noexcept;
+//
+// const_iterator cbegin()  const noexcept;
+// const_iterator cend()    const noexcept;
+
+#include <cassert>
+#include <cstddef>
+#include <deque>
+#include <flat_map>
+#include <functional>
+#include <vector>
+
+#include "MinSequenceContainer.h"
+#include "test_macros.h"
+#include "min_allocator.h"
+
+template <class KeyContainer, class ValueContainer>
+void test() {
+  using Key   = typename KeyContainer::value_type;
+  using Value = typename ValueContainer::value_type;
+  using M     = std::flat_multimap<Key, Value, std::less<Key>, KeyContainer, ValueContainer>;
+
+  M m         = {{1, 'a'}, {1, 'z'}, {2, 'b'}, {3, 'a'}, {3, 'b'}, {3, 'c'}, {4, 'd'}};
+  const M& cm = m;
+  ASSERT_SAME_TYPE(decltype(m.begin()), typename M::iterator);
+  ASSERT_SAME_TYPE(decltype(m.cbegin()), typename M::const_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.begin()), typename M::const_iterator);
+  ASSERT_SAME_TYPE(decltype(m.end()), typename M::iterator);
+  ASSERT_SAME_TYPE(decltype(m.cend()), typename M::const_iterator);
+  ASSERT_SAME_TYPE(decltype(cm.end()), typename M::const_iterator);
+  static_assert(noexcept(m.begin()));
+  static_assert(noexcept(cm.begin()));
+  static_assert(noexcept(m.cbegin()));
+  static_assert(noexcept(m.end()));
+  static_assert(noexcept(cm.end()));
+  static_assert(noexcept(m.cend()));
+  assert(m.size() == 7);
+  assert(std::distance(m.begin(), m.end()) == 7);
+  assert(std::distance(cm.begin(), cm.end()) == 7);
+  assert(std::distance(m.cbegin(), m.cend()) == 7);
+  typename M::iterator i;                   // default-construct
+  i                            = m.begin(); // move-assignment
+  typename M::const_iterator k = i;         // converting constructor
+  assert(i == k);                           // comparison
+  assert(i->first == 1);                    // operator->
+  assert(i->second == 'a');                 // operator->
+  ++i;                                      // pre-increment
+  assert(i->first == 1);                    // operator->
+  assert(i->second == 'z');                 // operator->
+  i = i + 3;                                // operator+
+  assert((*i).first == 3);                  // operator*
+  assert((*i).second == 'b');               // operator*
+  i += 3;                                   // operator+=
+  assert(i == m.end());                     // operator==
+  --i;                                      // pre-decrement
+  assert(i->first == 4);                    // operator->
+  assert(i->second == 'd');                 // operator->
+  i = i - 2;                                // operator-
+  assert(i->first == 3);                    // operator->
+  assert(i->second == 'b');                 // operator->
+  i -= 2;                                   // operator-=
+  assert(i > m.begin());                    // operator>
+}
+
+int main(int, char**) {
+  test<std::vector<int>, std::vector<char>>();
+  test<std::deque<int>, std::vector<char>>();
+  test<MinSequenceContainer<int>, MinSequenceContainer<char>>();
+  test<std::vector<int, min_allocator<int>>, std::vector<char, min_allocator<char>>>();
+
+  {
+    // N3644 testing
+    using C = std::flat_multimap<int, char>;
+    C::iterator ii1{}, ii2{};
+    C::iterator ii4 = ii1;
+    C::const_iterator cii{};
+    assert(ii1 == ii2);
+    assert(ii1 == ii4);
+    assert(!(ii1 != ii2));
ii2)); + + assert((ii1 == cii)); + assert((cii == ii1)); + assert(!(ii1 != cii)); + assert(!(cii != ii1)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp new file mode 100644 index 0000000000000..f1b2cad743e23 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_comparison.pass.cpp @@ -0,0 +1,155 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// flat_multimap iterators should be C++20 random access iterators + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using KI = typename KeyContainer::iterator; + using I = M::iterator; + using CI = M::const_iterator; + using RI = M::reverse_iterator; + using CRI = M::const_reverse_iterator; + + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + static_assert(std::equality_comparable); + + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + static_assert(std::totally_ordered); + + M m = {{1, 'a'}, {2, 'b'}, {2, 'e'}, {3, 'z'}, {3, 'y'}, {3, 'c'}, {4, 'd'}}; + + I i1 = m.begin(); + I i2 = m.begin() + 1; + + assert(i1 == i1); + assert(!(i1 != i1)); + assert(i1 != i2); + assert(!(i1 == i2)); + assert(i1 < i2); + assert(!(i1 < i1)); + assert(i1 <= i1); + assert(i1 <= i2); + assert(!(i2 <= i1)); + assert(i2 > i1); + assert(!(i2 > i2)); + assert(i2 >= i1); + assert(i2 >= i2); + assert(!(i1 >= i2)); + + CI ci1 = m.cbegin(); + CI ci2 = m.cbegin() + 1; + assert(ci1 == ci1); + assert(!(ci1 != ci1)); + assert(ci1 != ci2); + assert(!(ci1 == ci2)); + assert(ci1 < ci2); + assert(!(ci1 < ci1)); + assert(ci1 <= ci1); + assert(ci1 <= ci2); + assert(!(ci2 <= ci1)); + assert(ci2 > ci1); + assert(!(ci2 > ci2)); + assert(ci2 >= ci1); + assert(ci2 >= ci2); + assert(!(ci1 >= ci2)); + + RI ri1 = m.rbegin(); + RI ri2 = m.rbegin() + 1; + assert(ri1 == ri1); + assert(!(ri1 != ri1)); + assert(ri1 != ri2); + assert(!(ri1 == ri2)); + assert(ri1 < ri2); + assert(!(ri1 < ri1)); + assert(ri1 <= ri1); + assert(ri1 <= ri2); + assert(!(ri2 <= ri1)); + assert(ri2 > ri1); + assert(!(ri2 > ri2)); + assert(ri2 >= ri1); + assert(ri2 >= ri2); + assert(!(ri1 >= ri2)); + + CRI cri1 = m.crbegin(); + CRI cri2 = m.crbegin() + 1; + assert(cri1 == cri1); + assert(!(cri1 != cri1)); + assert(cri1 != cri2); + assert(!(cri1 == cri2)); + assert(cri1 < cri2); + assert(!(cri1 < cri1)); + assert(cri1 <= cri1); + assert(cri1 <= cri2); + assert(!(cri2 <= cri1)); + assert(cri2 > cri1); + assert(!(cri2 > cri2)); + assert(cri2 >= cri1); + assert(cri2 >= cri2); + assert(!(cri1 >= cri2)); + + if constexpr 
(std::three_way_comparable) { + static_assert(std::three_way_comparable); // ...of course the wrapped iterators still support <=>. + static_assert(std::three_way_comparable); + static_assert(std::three_way_comparable); + static_assert(std::three_way_comparable); + static_assert(std::same_as I()), std::strong_ordering>); + static_assert(std::same_as CI()), std::strong_ordering>); + static_assert(std::same_as CI()), std::strong_ordering>); + static_assert(std::same_as RI()), std::strong_ordering>); + static_assert(std::same_as CRI()), std::strong_ordering>); + static_assert(std::same_as CRI()), std::strong_ordering>); + + assert(i1 <=> i1 == std::strong_ordering::equivalent); + assert(i1 <=> i2 == std::strong_ordering::less); + assert(i2 <=> i1 == std::strong_ordering::greater); + + assert(ci1 <=> ci1 == std::strong_ordering::equivalent); + assert(ci1 <=> ci2 == std::strong_ordering::less); + assert(ci2 <=> ci1 == std::strong_ordering::greater); + + assert(ri1 <=> ri1 == std::strong_ordering::equivalent); + assert(ri1 <=> ri2 == std::strong_ordering::less); + assert(ri2 <=> ri1 == std::strong_ordering::greater); + + assert(cri1 <=> cri1 == std::strong_ordering::equivalent); + assert(cri1 <=> cri2 == std::strong_ordering::less); + assert(cri2 <=> cri1 == std::strong_ordering::greater); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp new file mode 100644 index 0000000000000..ce578e4def92b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/iterator_concept_conformance.compile.pass.cpp @@ -0,0 +1,84 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator, const_iterator, reverse_iterator, const_reverse_iterator + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using C = std::flat_multimap, KeyContainer, ValueContainer>; + using I = C::iterator; + using CI = C::const_iterator; + using RI = C::reverse_iterator; + using CRI = C::const_reverse_iterator; + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::contiguous_iterator); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(!std::indirectly_writable>); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(!std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::sentinel_for); + static_assert(std::indirectly_movable_storable*>); + static_assert(std::indirectly_movable_storable*>); + static_assert(std::indirectly_movable_storable*>); + static_assert(std::indirectly_movable_storable*>); + +#ifdef _LIBCPP_VERSION + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); + static_assert(std::is_same_v::iterator_category, std::random_access_iterator_tag>); +#endif +} + +void test() { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp new file mode 100644 index 0000000000000..979c0b090fd66 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/range_concept_conformance.compile.pass.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include +#include +#include +#include +#include "MinSequenceContainer.h" +#include "min_allocator.h" + +template +void test() { + { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using C = std::flat_multimap, KeyContainer, ValueContainer>; + + static_assert(std::same_as, typename C::iterator>); + static_assert(std::ranges::random_access_range); + static_assert(!std::ranges::contiguous_range); + static_assert(std::ranges::common_range); + static_assert(std::ranges::input_range); + static_assert(!std::ranges::view); + static_assert(std::ranges::sized_range); + static_assert(!std::ranges::borrowed_range); + static_assert(std::ranges::viewable_range); + + static_assert(std::same_as, typename C::const_iterator>); + static_assert(std::ranges::random_access_range); + static_assert(!std::ranges::contiguous_range); + static_assert(std::ranges::common_range); + static_assert(std::ranges::input_range); + static_assert(!std::ranges::view); + static_assert(std::ranges::sized_range); + static_assert(!std::ranges::borrowed_range); + static_assert(!std::ranges::viewable_range); + } +} + +void test() { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp new file mode 100644 index 0000000000000..8c1e5451f703f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.iterators/reverse_iterator.pass.cpp @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// reverse_iterator rbegin() noexcept; +// const_reverse_iterator rbegin() const noexcept; +// reverse_iterator rend() noexcept; +// const_reverse_iterator rend() const noexcept; +// +// const_reverse_iterator crbegin() const noexcept; +// const_reverse_iterator crend() const noexcept; + +#include +#include +#include +#include +#include +#include + +#include + +#include "test_macros.h" +#include + +int main(int, char**) { + { + using M = std::flat_multimap, std::deque, std::deque>; + M m = {{1, 'a'}, {1, 'b'}, {2, 'c'}, {2, 'd'}, {3, 'e'}, {3, 'f'}, {4, 'g'}, {4, 'h'}}; + const M& cm = m; + ASSERT_SAME_TYPE(decltype(m.rbegin()), M::reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.crbegin()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(cm.rbegin()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.rend()), M::reverse_iterator); + ASSERT_SAME_TYPE(decltype(m.crend()), M::const_reverse_iterator); + ASSERT_SAME_TYPE(decltype(cm.rend()), M::const_reverse_iterator); + static_assert(noexcept(m.rbegin())); + static_assert(noexcept(cm.rbegin())); + static_assert(noexcept(m.crbegin())); + static_assert(noexcept(m.rend())); + static_assert(noexcept(cm.rend())); + static_assert(noexcept(m.crend())); + assert(m.size() == 8); + assert(std::distance(m.rbegin(), m.rend()) == 8); + assert(std::distance(cm.rbegin(), cm.rend()) == 8); + assert(std::distance(m.crbegin(), m.crend()) == 8); + assert(std::distance(cm.crbegin(), cm.crend()) == 8); + M::reverse_iterator i; // default-construct + ASSERT_SAME_TYPE(decltype(i->first), const int&); + ASSERT_SAME_TYPE(decltype(i->second), char&); + i = m.rbegin(); // move-assignment + M::const_reverse_iterator k = i; // converting constructor + assert(i == k); // comparison + for (int j = 8; j >= 1; --j, ++i) { // pre-increment + assert(i->first == (j + 1) / 2); // operator-> + assert(i->second == 'a' + j - 1); + } + assert(i == m.rend()); + for (int j = 1; j <= 8; ++j) { + --i; // pre-decrement + assert((*i).first == (j + 1) / 2); + assert((*i).second == 'a' + j - 1); + } + assert(i == m.rbegin()); + } + { + // N3644 testing + using C = std::flat_multimap; + C::reverse_iterator ii1{}, ii2{}; + C::reverse_iterator ii4 = ii1; + C::const_reverse_iterator cii{}; + assert(ii1 == ii2); + assert(ii1 == ii4); + assert(!(ii1 != ii2)); + + assert((ii1 == cii)); + assert((cii == ii1)); + assert(!(ii1 != cii)); + assert(!(cii != ii1)); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp new file mode 100644 index 0000000000000..5b0788b6826fd --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/clear.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void clear() noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// test noexcept + +template +concept NoExceptClear = requires(T t) { + { t.clear() } noexcept; +}; + +static_assert(NoExceptClear>); +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert( + NoExceptClear, ThrowOnMoveContainer, ThrowOnMoveContainer>>); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + M m = {{5, 2}, {2, 1}, {2, 3}, {2, 1}, {5, 0}}; + assert(m.size() == 5); + ASSERT_NOEXCEPT(m.clear()); + ASSERT_SAME_TYPE(decltype(m.clear()), void); + m.clear(); + assert(m.size() == 0); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp new file mode 100644 index 0000000000000..9ef0c26e54ba3 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace.pass.cpp @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// iterator emplace(Args&&... args); + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "../../../Emplaceable.h" +#include "DefaultOnly.h" +#include "min_allocator.h" + +// Constraints: is_constructible_v, Args...> is true. +template +concept CanEmplace = requires(M m, Args&&... 
args) { m.emplace(std::forward(args)...); }; + +using Map = std::flat_multimap; +static_assert(CanEmplace); +static_assert(CanEmplace); +static_assert(CanEmplace, std::tuple>); +static_assert(!CanEmplace); +static_assert(!CanEmplace); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + + { + // was empty + M m; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 3.5); + } + { + // key does not exist and inserted at the begin + M m = {{3, 4.0}, {3, 3.0}, {3, 1.0}, {7, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin()); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key does not exist and inserted in the middle + M m = {{1, 4.0}, {1, 3.0}, {3, 1.0}, {4, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key does not exist and inserted at the end + M m = {{1, 4.0}, {1, 3.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key already exists and original at the begin + M m = {{2, 4.0}, {2, 3.0}, {5, 1.0}, {6, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key already exists and original in the middle + M m = {{0, 4.0}, {2, 3.0}, {2, 1.0}, {4, 0.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 3); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // key already exists and original at the end + M m = {{0, 4.0}, {1, 3.0}, {2, 1.0}}; + std::same_as decltype(auto) r = m.emplace(typename M::value_type(2, 2.0)); + assert(r == m.begin() + 3); + assert(m.size() == 4); + assert(r->first == 2); + assert(r->second == 2.0); + } +} + +template +void test_emplaceable() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + + M m; + std::same_as decltype(auto) r = + m.emplace(std::piecewise_construct, std::forward_as_tuple(2), std::forward_as_tuple()); + assert(r == m.begin()); + assert(m.size() == 1); + assert(m.begin()->first == 2); + assert(m.begin()->second == Emplaceable()); + r = m.emplace(std::piecewise_construct, std::forward_as_tuple(1), std::forward_as_tuple(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(m.begin()->first == 1); + assert(m.begin()->second == Emplaceable(2, 3.5)); + r = m.emplace(std::piecewise_construct, std::forward_as_tuple(1), std::forward_as_tuple(2, 3.5)); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(m.begin()->first == 1); + assert(m.begin()->second == Emplaceable(2, 3.5)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + test_emplaceable, std::vector>(); + test_emplaceable, std::vector>(); + test_emplaceable, MinSequenceContainer>(); + test_emplaceable>, 
std::vector>>(); + + { + auto emplace_func = [](auto& m, auto key_arg, auto value_arg) { + m.emplace(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + }; + test_emplace_exception_guarantee(emplace_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp new file mode 100644 index 0000000000000..588d27ea54f4d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/emplace_hint.pass.cpp @@ -0,0 +1,228 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// iterator emplace_hint(const_iterator position, Args&&... args); + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../../../Emplaceable.h" +#include "DefaultOnly.h" +#include "min_allocator.h" +#include "../helpers.h" + +#if defined(_LIBCPP_VERSION) +// spec only specifies `emplace(Args&&...)` is_constructible_v, Args...> is true. +// nothing mentioned for emplace_hint +template +concept CanEmplaceHint = + requires(M m, typename M::const_iterator i, Args&&... args) { m.emplace_hint(i, std::forward(args)...); }; + +using Map = std::flat_multimap; +static_assert(CanEmplaceHint); +static_assert(CanEmplaceHint); +static_assert(CanEmplaceHint, std::tuple>); +static_assert(!CanEmplaceHint); +static_assert(!CanEmplaceHint); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = M::iterator; + { + // was empty + M m; + std::same_as decltype(auto) r = m.emplace_hint(m.end(), typename M::value_type(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(m.begin()->first == 2); + assert(m.begin()->second == 3.5); + } + { + // hint correct and no duplicates + M m = {{0, 0.0}, {1, 1.0}, {3, 3.0}}; + auto it = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 4); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint correct and at the begin + M m = {{3, 3.0}, {4, 4.0}}; + auto it = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin()); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint correct and at the end + M m = {{0, 0.0}, {1, 1.0}}; + auto it = m.end(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint correct and at first duplicate + M m = {{0, 0.0}, {1, 1.0}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin() + 2; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + 
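+ // With an exact hint, the new (2, 2.0) lands at the hint position, i.e. before
+ // the pre-existing (2, 1.9); the asserts below check position and duplicate order.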
assert(m.size() == 6); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 2); + assert(std::next(r)->second == 1.9); + } + { + // hint correct and in-between duplicates + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin() + 4; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 4); + assert(m.size() == 7); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 2); + assert(std::next(r)->second == 2.1); + } + { + // hint correct and after duplicates + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin() + 5; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 5); + assert(m.size() == 7); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 3); + assert(std::next(r)->second == 3.0); + } + { + // hint incorrect and no duplicates + M m = {{0, 0.0}, {1, 1.0}, {3, 3.0}}; + auto it = m.begin() + 1; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 4); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint incorrect and at the begin + M m = {{0, 0.0}, {1, 1.0}}; + auto it = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint incorrect and at the end + M m = {{3, 3.0}, {4, 4.0}}; + auto it = m.end(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + assert(r == m.begin()); + assert(m.size() == 3); + assert(r->first == 2); + assert(r->second == 2.0); + } + { + // hint incorrect and before the first duplicate + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}}; + auto it = m.begin(); + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + // the result is as left as possible + assert(r == m.begin() + 2); + assert(m.size() == 7); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 2); + assert(std::next(r)->second == 1.8); + } + { + // hint incorrect and after the last duplicate + M m = {{0, 0.0}, {1, 1.0}, {2, 1.8}, {2, 1.9}, {2, 2.1}, {3, 3.0}, {4, 4.0}}; + auto it = m.begin() + 6; + std::same_as decltype(auto) r = m.emplace_hint(it, typename M::value_type(2, 2.0)); + // the result is as right as possible + assert(r == m.begin() + 5); + assert(m.size() == 8); + assert(r->first == 2); + assert(r->second == 2.0); + assert(std::next(r)->first == 3); + assert(std::next(r)->second == 3.0); + } +} + +template +void test_emplaceable() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = M::iterator; + + M m; + ASSERT_SAME_TYPE(decltype(m.emplace_hint(m.cbegin())), R); + R r = m.emplace_hint(m.end(), std::piecewise_construct, std::forward_as_tuple(2), std::forward_as_tuple()); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == Emplaceable()); + r = m.emplace_hint(m.end(), std::piecewise_construct, std::forward_as_tuple(1), std::forward_as_tuple(2, 3.5)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == Emplaceable(2, 3.5)); + r = m.emplace_hint(m.end(), std::piecewise_construct, std::forward_as_tuple(1), 
std::forward_as_tuple(2, 3.6)); + assert(r == m.begin() + 1); + assert(m.size() == 3); + assert(r->first == 1); + assert(r->second == Emplaceable(2, 3.6)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + test_emplaceable, std::vector>(); + test_emplaceable, std::vector>(); + test_emplaceable, MinSequenceContainer>(); + test_emplaceable>, std::vector>>(); + + { + auto emplace_func = [](auto& m, auto key_arg, auto value_arg) { + m.emplace_hint(m.begin(), std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + }; + test_emplace_exception_guarantee(emplace_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp new file mode 100644 index 0000000000000..78040be2e043d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter.pass.cpp @@ -0,0 +1,127 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator erase(iterator position); +// iterator erase(const_iterator position); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + using I = M::iterator; + + P ar[] = { + P(1, 1.5), + P(2, 2.5), + P(2, 2.6), + P(3, 3.5), + P(4, 4.5), + P(4, 4.5), + P(4, 4.7), + P(5, 5.5), + P(6, 6.5), + P(7, 7.5), + P(8, 8.5), + }; + M m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.size() == 11); + std::same_as decltype(auto) i1 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 10); + assert(i1 == std::next(m.begin(), 2)); + assert(std::ranges::equal( + m, + std::vector
<P>
    { + {1, 1.5}, {2, 2.5}, {3, 3.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i2 = m.erase(std::next(m.begin(), 0)); + assert(m.size() == 9); + assert(i2 == m.begin()); + assert(std::ranges::equal( + m, std::vector
<P>
    {{2, 2.5}, {3, 3.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i3 = m.erase(std::next(m.cbegin(), 8)); + assert(m.size() == 8); + assert(i3 == m.end()); + assert(std::ranges::equal( + m, std::vector
<P>
    {{2, 2.5}, {3, 3.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i4 = m.erase(std::next(m.begin(), 1)); + assert(m.size() == 7); + assert(i4 == std::next(m.begin())); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.5}, {4, 4.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i5 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 6); + assert(i5 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.5}, {4, 4.5}, {4, 4.7}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i6 = m.erase(std::next(m.begin(), 2)); + assert(m.size() == 5); + assert(i6 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.5}, {4, 4.5}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i7 = m.erase(std::next(m.cbegin(), 0)); + assert(m.size() == 4); + assert(i7 == std::next(m.begin(), 0)); + assert(std::ranges::equal(m, std::vector
<P>
    {{4, 4.5}, {5, 5.5}, {6, 6.5}, {7, 7.5}})); + + std::same_as decltype(auto) i8 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 3); + assert(i8 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{4, 4.5}, {5, 5.5}, {7, 7.5}})); + + std::same_as decltype(auto) i9 = m.erase(std::next(m.cbegin(), 2)); + assert(m.size() == 2); + assert(i9 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{4, 4.5}, {5, 5.5}})); + + std::same_as decltype(auto) i10 = m.erase(m.cbegin()); + assert(m.size() == 1); + assert(i10 == m.cbegin()); + assert(std::ranges::equal(m, std::vector
<P>
    {{5, 5.5}})); + + std::same_as decltype(auto) i11 = m.erase(m.begin()); + assert(m.size() == 0); + assert(i11 == m.begin()); + assert(i11 == m.end()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto erase_function = [](auto& m, auto) { m.erase(m.begin() + 2); }; + test_erase_exception_guarantee(erase_function); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp new file mode 100644 index 0000000000000..103f38c1c5d4a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_iter_iter.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator erase(const_iterator first, const_iterator last); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + using I = M::iterator; + + P ar[] = { + P(1, 1.5), + P(2, 2.5), + P(2, 2.6), + P(3, 3.5), + P(3, 3.6), + P(3, 3.7), + P(4, 4.5), + P(5, 5.5), + P(6, 6.5), + P(7, 7.5), + P(8, 8.5), + }; + M m(ar, ar + sizeof(ar) / sizeof(ar[0])); + assert(m.size() == 11); + std::same_as decltype(auto) i1 = m.erase(m.cbegin(), m.cbegin()); + assert(m.size() == 11); + assert(i1 == m.begin()); + assert(std::ranges::equal( + m, + std::vector
<P>
    { + {1, 1.5}, + {2, 2.5}, + {2, 2.6}, + {3, 3.5}, + {3, 3.6}, + {3, 3.7}, + {4, 4.5}, + {5, 5.5}, + {6, 6.5}, + {7, 7.5}, + {8, 8.5}})); + + std::same_as decltype(auto) i2 = m.erase(m.cbegin(), std::next(m.cbegin(), 2)); + assert(m.size() == 9); + assert(i2 == m.begin()); + assert(std::ranges::equal( + m, std::vector
<P>
    {{2, 2.6}, {3, 3.5}, {3, 3.6}, {3, 3.7}, {4, 4.5}, {5, 5.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i3 = m.erase(std::next(m.cbegin(), 2), std::next(m.cbegin(), 6)); + assert(m.size() == 5); + assert(i3 == std::next(m.begin(), 2)); + assert(std::ranges::equal(m, std::vector
<P>
    {{2, 2.6}, {3, 3.5}, {6, 6.5}, {7, 7.5}, {8, 8.5}})); + + std::same_as decltype(auto) i4 = m.erase(m.cbegin(), m.cend()); + assert(m.size() == 0); + assert(i4 == m.begin()); + assert(i4 == m.end()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto erase_function = [](auto& m, auto) { m.erase(m.begin(), m.begin() + 2); }; + test_erase_exception_guarantee(erase_function); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp new file mode 100644 index 0000000000000..7944996fba1a0 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type erase(const key_type& k); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template > +void test() { + using M = std::flat_multimap; + + auto make = [](std::initializer_list il) { + M m; + for (int i : il) { + m.emplace(i, i); + } + return m; + }; + M m = make({1, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, 8, 8, 9}); + ASSERT_SAME_TYPE(decltype(m.erase(9)), typename M::size_type); + auto n = m.erase(10); + assert(n == 0); + assert(m == make({1, 1, 2, 2, 2, 3, 4, 5, 5, 6, 7, 8, 8, 8, 8, 9})); + n = m.erase(4); + assert(n == 1); + assert(m == make({1, 1, 2, 2, 2, 3, 5, 5, 6, 7, 8, 8, 8, 8, 9})); + n = m.erase(1); + assert(n == 2); + assert(m == make({2, 2, 2, 3, 5, 5, 6, 7, 8, 8, 8, 8, 9})); + n = m.erase(8); + assert(n == 4); + assert(m == make({2, 2, 2, 3, 5, 5, 6, 7, 9})); + n = m.erase(3); + assert(n == 1); + assert(m == make({2, 2, 2, 5, 5, 6, 7, 9})); + n = m.erase(4); + assert(n == 0); + assert(m == make({2, 2, 2, 5, 5, 6, 7, 9})); + n = m.erase(6); + assert(n == 1); + assert(m == make({2, 2, 2, 5, 5, 7, 9})); + n = m.erase(7); + assert(n == 1); + assert(m == make({2, 2, 2, 5, 5, 9})); + n = m.erase(2); + assert(n == 3); + assert(m == make({5, 5, 9})); + n = m.erase(5); + assert(n == 2); + assert(m == make({9})); + n = m.erase(9); + assert(n == 1); + assert(m.empty()); + n = m.erase(1); + assert(n == 0); + assert(m.empty()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector, std::greater<>>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto erase_function = [](auto& m, auto key_arg) { + using Map = std::decay_t; + using Key = typename Map::key_type; + const Key key{key_arg}; + m.erase(key); + }; + test_erase_exception_guarantee(erase_function); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp new file 
mode 100644 index 0000000000000..75a2d205b8f87 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/erase_key_transparent.pass.cpp @@ -0,0 +1,161 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type erase(K&& k); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanErase = requires(M m, Transparent k) { m.erase(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanErase); +static_assert(!CanErase); +static_assert(!CanErase); +static_assert(!CanErase); + +template +struct HeterogeneousKey { + explicit HeterogeneousKey(Key key, It it) : key_(key), it_(it) {} + operator It() && { return it_; } + auto operator<=>(Key key) const { return key_ <=> key; } + friend bool operator<(const HeterogeneousKey&, const HeterogeneousKey&) { + assert(false); + return false; + } + Key key_; + It it_; +}; + +template +void test_simple() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{1, 1}, {2, 2}, {2, 2}, {3, 3}, {3, 4}, {3, 5}, {4, 4}}; + ASSERT_SAME_TYPE(decltype(m.erase(9)), typename M::size_type); + auto n = m.erase(3); // erase(K&&) [with K=int] + assert(n == 3); + assert((m == M{{1, 1}, {2, 2}, {2, 2}, {4, 4}})); + typename M::key_type lvalue = 2; + n = m.erase(lvalue); // erase(K&&) [with K=int&] + assert(n == 2); + assert((m == M{{1, 1}, {4, 4}})); + const typename M::key_type const_lvalue = 1; + n = m.erase(const_lvalue); // erase(const key_type&) + assert(n == 1); + assert((m == M{{4, 4}})); +} + +template +void test_transparent_comparator() { + using M = std::flat_multimap; + using P = std::pair; + M m = { + {"alpha", 1}, {"beta", 2}, {"epsilon", 3}, {"epsilon", 4}, {"eta", 4}, {"gamma", 5}, {"gamma", 6}, {"gamma", 7}}; + ASSERT_SAME_TYPE(decltype(m.erase(Transparent{"abc"})), typename M::size_type); + + auto n = m.erase(Transparent{"epsilon"}); + assert(n == 2); + assert(std::ranges::equal( + m, std::vector
<P>
    {{"alpha", 1}, {"beta", 2}, {"eta", 4}, {"gamma", 5}, {"gamma", 6}, {"gamma", 7}})); + + auto n2 = m.erase(Transparent{"aaa"}); + assert(n2 == 0); + assert(std::ranges::equal( + m, std::vector
<P>
    {{"alpha", 1}, {"beta", 2}, {"eta", 4}, {"gamma", 5}, {"gamma", 6}, {"gamma", 7}})); + + auto n3 = m.erase(Transparent{"gamma"}); + assert(n3 == 3); + assert(std::ranges::equal(m, std::vector
<P>
    {{"alpha", 1}, {"beta", 2}, {"eta", 4}})); + + auto n4 = m.erase(Transparent{"alpha"}); + assert(n4 == 1); + assert(std::ranges::equal(m, std::vector
<P>
    {{"beta", 2}, {"eta", 4}})); + + auto n5 = m.erase(Transparent{"alpha"}); + assert(n5 == 0); + assert(std::ranges::equal(m, std::vector
<P>
    {{"beta", 2}, {"eta", 4}})); + + auto n6 = m.erase(Transparent{"beta"}); + assert(n6 == 1); + assert(std::ranges::equal(m, std::vector
<P>
    {{"eta", 4}})); + + auto n7 = m.erase(Transparent{"eta"}); + assert(n7 == 1); + assert(std::ranges::equal(m, std::vector
<P>
    {})); + + auto n8 = m.erase(Transparent{"eta"}); + assert(n8 == 0); + assert(std::ranges::equal(m, std::vector
<P>
    {})); +} + +int main(int, char**) { + test_simple, std::vector>(); + test_simple, std::vector>(); + test_simple, MinSequenceContainer>(); + test_simple>, std::vector>>(); + + test_transparent_comparator, std::vector>(); + test_transparent_comparator, std::vector>(); + test_transparent_comparator, MinSequenceContainer>(); + test_transparent_comparator>, + std::vector>>(); + + { + // P2077's HeterogeneousKey example + using M = std::flat_multimap>; + M m = {{1, 1}, {2, 2}, {3, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {6, 6}, {7, 7}, {8, 8}, {8, 8}}; + auto h1 = HeterogeneousKey(8, m.begin()); + std::same_as auto n = m.erase(h1); // lvalue is not convertible to It; erase(K&&) is the best match + assert(n == 2); + assert((m == M{{1, 1}, {2, 2}, {3, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {6, 6}, {7, 7}})); + std::same_as auto it = m.erase(std::move(h1)); // rvalue is convertible to It; erase(K&&) drops out + assert(it == m.begin()); + assert((m == M{{2, 2}, {3, 3}, {3, 3}, {4, 4}, {5, 5}, {6, 6}, {6, 6}, {7, 7}})); + } + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 3}, {3, 3}}, c); + assert(!transparent_used); + auto n = m.erase(Transparent{3}); + assert(n == 2); + assert(transparent_used); + } + { + auto erase_transparent = [](auto& m, auto key_arg) { + using Map = std::decay_t; + using Key = typename Map::key_type; + m.erase(Transparent{key_arg}); + }; + test_erase_exception_guarantee(erase_transparent); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp new file mode 100644 index 0000000000000..f5ed4a9663a9d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/extract.pass.cpp @@ -0,0 +1,93 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// containers extract() &&; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +concept CanExtract = requires(T&& t) { std::forward(t).extract(); }; + +static_assert(CanExtract&&>); +static_assert(!CanExtract&>); +static_assert(!CanExtract const&>); +static_assert(!CanExtract const&&>); + +template +void test() { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = M({1, 2, 2, 2, 3, 3}, {4, 5, 6, 7, 8, 9}); + + std::same_as auto containers = std::move(m).extract(); + + auto expected_keys = {1, 2, 2, 2, 3, 3}; + auto expected_values = {4, 5, 6, 7, 8, 9}; + assert(std::ranges::equal(containers.keys, expected_keys)); + assert(std::ranges::equal(containers.values, expected_values)); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + LIBCPP_ASSERT(m.keys().size() == 0); + LIBCPP_ASSERT(m.values().size() == 0); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + { + // extracted object maintains invariant if one of underlying container does not clear after move + using M = std::flat_multimap, std::vector, CopyOnlyVector>; + M m = M({1, 2, 2, 2, 3, 3}, {1, 2, 3, 4, 5, 6}); + std::same_as auto containers = std::move(m).extract(); + assert(containers.keys.size() == 6); + assert(containers.values.size() == 6); + check_invariant(m); + LIBCPP_ASSERT(m.empty()); + LIBCPP_ASSERT(m.keys().size() == 0); + LIBCPP_ASSERT(m.values().size() == 0); + } + + { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = std::vector; + using ValueContainer = ThrowOnMoveContainer; + using M = std::flat_multimap; + + M m; + m.emplace(1, 1); + m.emplace(1, 1); + try { + auto c = std::move(m).extract(); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we try to erase the key after value emplacement failure. + // and after erasure failure, we clear the flat_multimap + LIBCPP_ASSERT(m.size() == 0); + } +#endif + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp new file mode 100644 index 0000000000000..88c173d8a6917 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_cv.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert(const value_type& v); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../helpers.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + using VT = typename M::value_type; + M m; + + const VT v1(2, 2.5); + std::same_as decltype(auto) r = m.insert(v1); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2.5); + + const VT v2(1, 1.5); + r = m.insert(v2); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1.5); + + const VT v3(3, 3.5); + r = m.insert(v3); + assert(r == m.begin() + 2); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3.5); + + const VT v4(3, 4.5); + r = m.insert(v4); + assert(r == m.begin() + 3); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 4.5); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + const value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(p); + }; + test_emplace_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp new file mode 100644 index 0000000000000..098b66cc49f18 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_initializer_list.pass.cpp @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void insert(initializer_list il); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using V = std::pair; + + M m = {{1, 1}, {1, 1.5}, {1, 2}, {3, 1}, {3, 1.5}, {3, 2}}; + m.insert({ + {4, 1}, + {4, 1.5}, + {4, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {2, 1}, + {2, 1.5}, + {2, 2}, + }); + assert(m.size() == 15); + std::vector expected = { + {1, 1}, + {1, 1.5}, + {1, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {2, 1}, + {2, 1.5}, + {2, 2}, + {3, 1}, + {3, 1.5}, + {3, 2}, + {4, 1}, + {4, 1.5}, + {4, 2}, + }; + assert(std::ranges::equal(m, expected)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + std::initializer_list il = {{newValues[0].first, newValues[0].second}}; + m.insert(il); + }; + test_insert_range_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp new file mode 100644 index 0000000000000..9d645043a15ca --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_cv.pass.cpp @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert(const_iterator position, const value_type& v); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "../helpers.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = typename M::iterator; + using VT = typename M::value_type; + + M m; + const VT v1(2, 2.5); + std::same_as decltype(auto) r = m.insert(m.end(), v1); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2.5); + + const VT v2(1, 1.5); + r = m.insert(m.end(), v2); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1.5); + + const VT v3(3, 3.5); + r = m.insert(m.end(), v3); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3.5); + + const VT v4(3, 4.5); + r = m.insert(m.end(), v4); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 4.5); + + const VT v5(2, 5.5); + r = m.insert(m.end(), v5); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 5.5); + + const VT v6(2, 6.5); + r = m.insert(m.begin(), v6); + assert(r == m.begin() + 1); + assert(m.size() == 6); + assert(r->first == 2); + assert(r->second == 6.5); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + const value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(m.begin(), p); + }; + test_emplace_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp new file mode 100644 index 0000000000000..ae031bd010f76 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_iter.pass.cpp @@ -0,0 +1,109 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// void insert(InputIterator first, InputIterator last); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint InputIterator +template +concept CanInsert = requires(M m, Args&&... 
args) { m.insert(std::forward(args)...); }; + +using Map = std::flat_multimap; +using Pair = std::pair; + +static_assert(CanInsert); +static_assert(CanInsert, cpp17_input_iterator>); +static_assert(!CanInsert); +static_assert(!CanInsert, cpp20_input_iterator>); + +template +void test() { + using P = std::pair; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + P ar1[] = { + P(2, 1), + P(2, 1.5), + P(2, 2), + P(1, 1), + P(1, 1.5), + P(1, 2), + P(3, 1), + P(3, 1.5), + P(3, 2), + }; + P ar2[] = { + P(4, 1), + P(4, 1.5), + P(4, 2), + P(1, 1), + P(1, 1.5), + P(1, 2), + P(0, 1), + P(0, 1.5), + P(0, 2), + }; + + M m; + m.insert(cpp17_input_iterator(ar1), cpp17_input_iterator(ar1 + sizeof(ar1) / sizeof(ar1[0]))); + assert(m.size() == 9); + std::vector
<P>
    expected{{1, 1}, {1, 1.5}, {1, 2}, {2, 1}, {2, 1.5}, {2, 2}, {3, 1}, {3, 1.5}, {3, 2}}; + assert(std::ranges::equal(m, expected)); + + m.insert(cpp17_input_iterator(ar2), cpp17_input_iterator(ar2 + sizeof(ar2) / sizeof(ar2[0]))); + assert(m.size() == 18); + std::vector
<P>
    expected2{ + {0, 1}, + {0, 1.5}, + {0, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {1, 1}, + {1, 1.5}, + {1, 2}, + {2, 1}, + {2, 1.5}, + {2, 2}, + {3, 1}, + {3, 1.5}, + {3, 2}, + {4, 1}, + {4, 1.5}, + {4, 2}}; + assert(std::ranges::equal(m, expected2)); +} +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { m.insert(newValues.begin(), newValues.end()); }; + test_insert_range_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp new file mode 100644 index 0000000000000..61962f4873aee --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_iter_rv.pass.cpp @@ -0,0 +1,103 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert(const_iterator position, value_type&&); + +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "../helpers.h" +#include "test_macros.h" + +template +void do_insert_iter_rv_test() { + using M = Container; + using P = Pair; + using R = typename M::iterator; + M m; + std::same_as decltype(auto) r = m.insert(m.end(), P(2, 2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2); + + r = m.insert(m.end(), P(1, 1)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1); + + r = m.insert(m.end(), P(3, 3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3); + + r = m.insert(m.end(), P(3, 4)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 4); + + r = m.insert(m.end(), P(2, 5)); + assert(r == m.begin() + 2); + assert(m.size() == 5); + assert(r->first == 2); + assert(r->second == 5); + + r = m.insert(m.begin(), P(2, 6)); + assert(r == m.begin() + 1); + assert(m.size() == 6); + assert(r->first == 2); + assert(r->second == 6); +} + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + using CP = std::pair; + + do_insert_iter_rv_test(); + do_insert_iter_rv_test(); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, std::deque>(); + test, std::deque>(); + test, MinSequenceContainer>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(m.begin(), std::move(p)); + }; + 
test_emplace_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp new file mode 100644 index 0000000000000..97b8f17d1094f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_range.pass.cpp @@ -0,0 +1,101 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template R> +// void insert_range(R&& rg); + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "MoveOnly.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint container-compatible-range +template +concept CanInsertRange = requires(M m, R&& r) { m.insert_range(std::forward(r)); }; + +using Map = std::flat_multimap; + +static_assert(CanInsertRange*>>); +static_assert(CanInsertRange*>>); +static_assert(!CanInsertRange>); +static_assert(!CanInsertRange>); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + + { + using P = std::pair; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using It = forward_iterator; + M m = {{10, 1}, {8, 2}, {5, 3}, {2, 4}, {1, 5}}; + P ar[] = {{3, 1}, {1, 2}, {4, 3}, {1, 4}, {5, 5}, {9, 6}}; + std::ranges::subrange r = {It(ar), It(ar + 6)}; + static_assert(std::ranges::common_range); + m.insert_range(r); + std::vector
<P>
    expected = {{1, 5}, {1, 2}, {1, 4}, {2, 4}, {3, 1}, {4, 3}, {5, 3}, {5, 5}, {8, 2}, {9, 6}, {10, 1}}; + assert(std::ranges::equal(m, expected)); + } + { + using P = std::pair; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using It = cpp20_input_iterator; + M m = {{8, 1}, {5, 2}, {3, 3}, {2, 4}}; + P ar[] = {{3, 1}, {1, 2}, {4, 3}, {1, 4}, {5, 5}, {9, 6}}; + std::ranges::subrange r = {It(ar), sentinel_wrapper(It(ar + 6))}; + static_assert(!std::ranges::common_range); + m.insert_range(r); + std::vector
<P>
    expected = {{9, 6}, {8, 1}, {5, 2}, {5, 5}, {4, 3}, {3, 3}, {3, 1}, {2, 4}, {1, 2}, {1, 4}}; + assert(std::ranges::equal(m, expected)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + { + // Items are forwarded correctly from the input range (P2767). + std::pair a[] = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}}; + std::flat_multimap m; + m.insert_range(a | std::views::as_rvalue); + std::pair expected[] = {{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } + { + // The element type of the range doesn't need to be std::pair (P2767). + std::pair pa[] = {{3, 3}, {1, 1}, {4, 4}, {1, 1}, {5, 5}}; + std::deque>> a(pa, pa + 5); + std::flat_multimap m; + m.insert_range(a); + std::pair expected[] = {{1, 1}, {1, 1}, {3, 3}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } + { + auto insert_func = [](auto& m, const auto& newValues) { m.insert_range(newValues); }; + test_insert_range_exception_guarantee(insert_func); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp new file mode 100644 index 0000000000000..573150248ca48 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_rv.pass.cpp @@ -0,0 +1,116 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator insert( value_type&& v); + +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +template +void do_insert_rv_test() { + using M = Container; + using P = Pair; + using R = typename M::iterator; + M m; + std::same_as decltype(auto) r = m.insert(P(2, 2)); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2); + + r = m.insert(P(1, 1)); + assert(r == m.begin()); + assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1); + + r = m.insert(P(3, 3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3); + + r = m.insert(P(3, 3)); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 3); +} + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + using P = std::pair; + using CP = std::pair; + + do_insert_rv_test(); + do_insert_rv_test(); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + using M = std::flat_multimap; + using R = M::iterator; + M m; + R r = m.insert({2, MoveOnly(2)}); + assert(r == m.begin()); + assert(m.size() == 1); + assert(r->first == 2); + assert(r->second == 2); + + r = m.insert({1, MoveOnly(1)}); + assert(r == m.begin()); + 
assert(m.size() == 2); + assert(r->first == 1); + assert(r->second == 1); + + r = m.insert({3, MoveOnly(3)}); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 3); + assert(r->first == 3); + assert(r->second == 3); + + r = m.insert({3, MoveOnly(3)}); + assert(r == std::ranges::prev(m.end())); + assert(m.size() == 4); + assert(r->first == 3); + assert(r->second == 3); + } + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + value_type p(std::piecewise_construct, std::tuple(key_arg), std::tuple(value_arg)); + m.insert(std::move(p)); + }; + test_emplace_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp new file mode 100644 index 0000000000000..334dff0a0d2f6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_initializer_list.pass.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void insert(sorted_equivalent_t, initializer_list il); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + using V = std::pair; + M m = {{1, 1}, {1, 1.5}, {1, 2}, {3, 1}, {3, 1.5}, {3, 2}}; + m.insert(std::sorted_equivalent, + { + {0, 1}, + {1, 2}, + {1, 3}, + {2, 1}, + {2, 4}, + {4, 1}, + }); + assert(m.size() == 12); + V expected[] = {{0, 1}, {1, 1}, {1, 1.5}, {1, 2}, {1, 2}, {1, 3}, {2, 1}, {2, 4}, {3, 1}, {3, 1.5}, {3, 2}, {4, 1}}; + assert(std::ranges::equal(m, expected)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { + using FlatMap = std::decay_t; + using value_type = typename FlatMap::value_type; + std::initializer_list il = {{newValues[0].first, newValues[0].second}}; + m.insert(std::sorted_equivalent, il); + }; + test_insert_range_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp new file mode 100644 index 0000000000000..37808470a2cf7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_sorted_iter_iter.pass.cpp @@ -0,0 +1,94 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM 
Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template +// void insert(sorted_equivalent_t, InputIterator first, InputIterator last); + +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// test constraint InputIterator +template +concept CanInsert = requires(M m, Args&&... args) { m.insert(std::forward(args)...); }; + +using Map = std::flat_multimap; +using Pair = std::pair; + +static_assert(CanInsert); +static_assert(CanInsert, cpp17_input_iterator>); +static_assert(!CanInsert); +static_assert(!CanInsert, cpp20_input_iterator>); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using P = std::pair; + + P ar1[] = { + P(1, 1), + P(1, 0), + P(2, 1), + P(2, 3), + P(3, 1), + }; + + P ar2[] = { + P(0, 1), + P(2, 2), + P(2, 5), + P(4, 1), + P(4, 4), + }; + + M m; + m.insert(std::sorted_equivalent, + cpp17_input_iterator(ar1), + cpp17_input_iterator(ar1 + sizeof(ar1) / sizeof(ar1[0]))); + assert(m.size() == 5); + P expected[] = {{1, 1}, {1, 0}, {2, 1}, {2, 3}, {3, 1}}; + assert(std::ranges::equal(m, expected)); + + m.insert(std::sorted_equivalent, + cpp17_input_iterator(ar2), + cpp17_input_iterator(ar2 + sizeof(ar2) / sizeof(ar2[0]))); + assert(m.size() == 10); + P expected2[] = {{0, 1}, {1, 1}, {1, 0}, {2, 1}, {2, 3}, {2, 2}, {2, 5}, {3, 1}, {4, 1}, {4, 4}}; + assert(std::ranges::equal(m, expected2)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + auto insert_func = [](auto& m, const auto& newValues) { + m.insert(std::sorted_equivalent, newValues.begin(), newValues.end()); + }; + test_insert_range_exception_guarantee(insert_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp new file mode 100644 index 0000000000000..33ca4d4e30469 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/insert_transparent.pass.cpp @@ -0,0 +1,135 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator insert(P&& x); +// template iterator insert(const_iterator hint, P&& x); + +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "test_iterators.h" +#include "min_allocator.h" + +// Constraints: is_constructible_v, P> is true. +template +concept CanInsert = requires(M m, Args&&... 
args) { m.insert(std::forward(args)...); }; + +using Map = std::flat_multimap; +using Iter = Map::const_iterator; + +static_assert(CanInsert&&>); +static_assert(CanInsert&&>); +static_assert(CanInsert&&>); +static_assert(CanInsert&&>); +static_assert(!CanInsert); +static_assert(!CanInsert); + +static int expensive_comparisons = 0; +static int cheap_comparisons = 0; + +struct CompareCounter { + int i_ = 0; + CompareCounter(int i) : i_(i) {} + friend auto operator<=>(const CompareCounter& x, const CompareCounter& y) { + expensive_comparisons += 1; + return x.i_ <=> y.i_; + } + bool operator==(const CompareCounter&) const = default; + friend auto operator<=>(const CompareCounter& x, int y) { + cheap_comparisons += 1; + return x.i_ <=> y; + } +}; + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + { + // insert(P&&) + // Unlike flat_set, here we can't use key_compare to compare value_type versus P, + // so we must eagerly convert to value_type. + M m = {{1, 1}, {2, 2}, {3, 1}, {3, 4}, {4, 4}, {5, 5}}; + expensive_comparisons = 0; + cheap_comparisons = 0; + std::same_as decltype(auto) r = m.insert(std::make_pair(3, 3)); // conversion happens first + assert(expensive_comparisons >= 2); + assert(cheap_comparisons == 0); + assert(r == m.begin() + 4); + + std::pair expected[] = {{1, 1}, {2, 2}, {3, 1}, {3, 4}, {3, 3}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } + { + // insert(const_iterator, P&&) + M m = {{1, 1}, {2, 2}, {3, 1}, {3, 4}, {4, 4}, {5, 5}}; + expensive_comparisons = 0; + cheap_comparisons = 0; + std::same_as auto it = m.insert(m.begin(), std::make_pair(3, 3)); + assert(expensive_comparisons >= 2); + assert(cheap_comparisons == 0); + assert(it == m.begin() + 2); + std::pair expected[] = {{1, 1}, {2, 2}, {3, 3}, {3, 1}, {3, 4}, {4, 4}, {5, 5}}; + assert(std::ranges::equal(m, expected)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + // no ambiguity between insert(pos, P&&) and insert(first, last) + using M = std::flat_multimap; + struct Evil { + operator M::value_type() const; + operator M::const_iterator() const; + }; + std::flat_multimap m; + ASSERT_SAME_TYPE(decltype(m.insert(Evil())), M::iterator); + ASSERT_SAME_TYPE(decltype(m.insert(m.begin(), Evil())), M::iterator); + ASSERT_SAME_TYPE(decltype(m.insert(m.begin(), m.end())), void); + } + { + auto insert_func = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using tuple_type = std::tuple; + tuple_type t(key_arg, value_arg); + m.insert(t); + }; + test_emplace_exception_guarantee(insert_func); + } + { + auto insert_func_iter = [](auto& m, auto key_arg, auto value_arg) { + using FlatMap = std::decay_t; + using tuple_type = std::tuple; + tuple_type t(key_arg, value_arg); + m.insert(m.begin(), t); + }; + test_emplace_exception_guarantee(insert_func_iter); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp new file mode 100644 index 0000000000000..86fbaff468ab6 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/replace.pass.cpp @@ -0,0 +1,82 @@ 
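+// (A hedged usage sketch of the replace() contract exercised in this file, not
+// code from the patch itself: replace() adopts both containers wholesale; the
+// caller promises the keys are already sorted with respect to the comparator
+// and that both containers have the same size, so no re-sort happens.)
+//
+//   std::flat_multimap<int, char> m;
+//   std::vector<int>  ks{1, 1, 2};   // pre-sorted; duplicate keys are allowed
+//   std::vector<char> vs{'a', 'b', 'c'};
+//   m.replace(std::move(ks), std::move(vs));  // adopts the containers by move
+//   assert(m.size() == 3);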
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void replace(key_container_type&& key_cont, mapped_container_type&& mapped_cont); + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +concept CanReplace = requires(T t, Args&&... args) { t.replace(std::forward(args)...); }; + +using Map = std::flat_multimap; +static_assert(CanReplace, std::vector>); +static_assert(!CanReplace&, std::vector>); +static_assert(!CanReplace, const std::vector&>); +static_assert(!CanReplace&, const std::vector&>); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + M m = M({1, 1, 3}, {4, 5, 6}); + KeyContainer new_keys = {7, 7}; + ValueContainer new_values = {9, 10}; + auto expected_keys = new_keys; + auto expected_values = new_values; + m.replace(std::move(new_keys), std::move(new_values)); + assert(m.size() == 2); + assert(std::ranges::equal(m.keys(), expected_keys)); + assert(std::ranges::equal(m.values(), expected_values)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = std::vector; + using ValueContainer = ThrowOnMoveContainer; + using M = std::flat_multimap; + + M m; + m.emplace(1, 1); + m.emplace(2, 2); + try { + KeyContainer new_keys{3, 4}; + ValueContainer new_values{5, 6}; + m.replace(std::move(new_keys), std::move(new_values)); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear the map + LIBCPP_ASSERT(m.size() == 0); + } +#endif + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp new file mode 100644 index 0000000000000..a1252f301309a --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_exception.pass.cpp @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// `check_assertion.h` requires Unix headers and regex support. 
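+// (Aside, a hedged sketch of the property under test in this file: both member
+// and free swap are declared noexcept, so an exception escaping a throwing
+// container swap has nowhere to propagate and std::terminate must be called.
+// ThrowOnMoveContainer comes from "../helpers.h".)
+//
+//   std::flat_multimap<int, int, std::less<int>,
+//                      ThrowOnMoveContainer<int>, std::vector<int>> m1, m2;
+//   // swap(m1, m2);  // reaches std::terminate if ThrowOnMoveContainer throws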
+// REQUIRES: has-unix-headers +// UNSUPPORTED: no-localization +// UNSUPPORTED: no-exceptions + +// + +// class flat_multimap + +// void swap(flat_multimap& y) noexcept; +// friend void swap(flat_multimap& x, flat_multimap& y) noexcept + +// Test that std::terminate is called if any exception is thrown during swap + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "../helpers.h" +#include "check_assertion.h" + +template +void test_swap_exception_guarantee([[maybe_unused]] F&& swap_function) { + { + // key swap throws + using KeyContainer = ThrowOnMoveContainer; + using ValueContainer = std::vector; + using M = std::flat_multimap; + + M m1, m2; + m1.emplace(1, 1); + m1.emplace(1, 2); + m2.emplace(3, 3); + m2.emplace(3, 4); + // swap is noexcept + EXPECT_STD_TERMINATE([&] { swap_function(m1, m2); }); + } + + { + // value swap throws + using KeyContainer = std::vector; + using ValueContainer = ThrowOnMoveContainer; + using M = std::flat_multimap; + + M m1, m2; + m1.emplace(1, 1); + m1.emplace(1, 2); + m2.emplace(3, 3); + m2.emplace(3, 4); + + // swap is noexcept + EXPECT_STD_TERMINATE([&] { swap_function(m1, m2); }); + } +} + +int main(int, char**) { + { + auto swap_func = [](auto& m1, auto& m2) { swap(m1, m2); }; + test_swap_exception_guarantee(swap_func); + } + + { + auto swap_func = [](auto& m1, auto& m2) { m1.swap(m2); }; + test_swap_exception_guarantee(swap_func); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp new file mode 100644 index 0000000000000..f96155d714dc9 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_free.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// friend void swap(flat_multimap& x, flat_multimap& y) noexcept + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +// test noexcept + +template +concept NoExceptAdlSwap = requires(T t1, T t2) { + { swap(t1, t2) } noexcept; +}; + +static_assert(NoExceptAdlSwap>); + +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert(NoExceptAdlSwap< + std::flat_multimap, ThrowOnMoveContainer, ThrowOnMoveContainer>>); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using V = std::pair; + + { + M m1; + M m2; + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar2[] = {V(5, 5), V(5, 6), V(5, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(10, 12)}; + M m1; + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2; + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + V ar2[] = {V(5, 5), V(5, 6), V(5, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(10, 12)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + swap(m1, m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp new file mode 100644 index 0000000000000..ab7be3b8ac22e --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.modifiers/swap_member.pass.cpp @@ -0,0 +1,97 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// void swap(flat_multimap& y) noexcept; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "MoveOnly.h" +#include "min_allocator.h" +#include "test_macros.h" +#include "../helpers.h" + +// test noexcept + +template +concept NoExceptMemberSwap = requires(T t1, T t2) { + { t1.swap(t2) } noexcept; +}; + +static_assert(NoExceptMemberSwap>); +#ifndef TEST_HAS_NO_EXCEPTIONS +static_assert(NoExceptMemberSwap< + std::flat_multimap, ThrowOnMoveContainer, ThrowOnMoveContainer>>); +#endif + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using V = std::pair; + { + M m1; + M m2; + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar2[] = {V(5, 5), V(5, 6), V(7, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(12, 12)}; + M m1; + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2; + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } + { + V ar1[] = {V(1, 1), V(1, 2), V(3, 3), V(4, 4)}; + V ar2[] = {V(5, 5), V(5, 6), V(7, 7), V(8, 8), V(9, 9), V(10, 10), V(10, 11), V(12, 12)}; + M m1(ar1, ar1 + sizeof(ar1) / sizeof(ar1[0])); + M m2(ar2, ar2 + sizeof(ar2) / sizeof(ar2[0])); + M m1_save = m1; + M m2_save = m2; + m1.swap(m2); + assert(m1 == m2_save); + assert(m2 == m1_save); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp new file mode 100644 index 0000000000000..47140132c6e47 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/comp.pass.cpp @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// key_compare key_comp() const; +// value_compare value_comp() const; + +#include +#include +#include +#include +#include + +#include "test_macros.h" + +int main(int, char**) { + { + using M = std::flat_multimap; + using Comp = std::less; // the default + M m = {}; + ASSERT_SAME_TYPE(M::key_compare, Comp); + static_assert(!std::is_same_v); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), M::value_compare); + Comp kc = m.key_comp(); + assert(kc(1, 2)); + assert(!kc(2, 1)); + auto vc = m.value_comp(); + ASSERT_SAME_TYPE(decltype(vc(std::make_pair(1, 2), std::make_pair(1, 2))), bool); + assert(vc({1, '2'}, {2, '1'})); + assert(!vc({2, '1'}, {1, '2'})); + } + { + using Comp = std::function; + using M = std::flat_multimap; + Comp comp = std::greater(); + M m({}, comp); + ASSERT_SAME_TYPE(M::key_compare, Comp); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), M::value_compare); + Comp kc = m.key_comp(); + assert(!kc(1, 2)); + assert(kc(2, 1)); + auto vc = m.value_comp(); + auto a = std::make_pair(1, 2); + ASSERT_SAME_TYPE(decltype(vc(a, a)), bool); + static_assert(!noexcept(vc(a, a))); + assert(!vc({1, 2}, {2, 1})); + assert(vc({2, 1}, {1, 2})); + } + { + using Comp = std::less<>; + using M = std::flat_multimap; + M m = {}; + ASSERT_SAME_TYPE(M::key_compare, Comp); + ASSERT_SAME_TYPE(decltype(m.key_comp()), Comp); + ASSERT_SAME_TYPE(decltype(m.value_comp()), M::value_compare); + Comp kc = m.key_comp(); + assert(kc(1, 2)); + assert(!kc(2, 1)); + auto vc = m.value_comp(); + auto a = std::make_pair(1, 2); + ASSERT_SAME_TYPE(decltype(vc(a, a)), bool); + assert(vc({1, 2}, {2, 1})); + assert(!vc({2, 1}, {1, 2})); + } + { + using Comp = std::function&, const std::vector&)>; + using M = std::flat_multimap, int, Comp>; + Comp comp = [i = 1](const auto& x, const auto& y) { return x[i] < y[i]; }; + M m({}, comp); + auto vc = m.value_comp(); + static_assert(sizeof(vc) >= sizeof(Comp)); + comp = nullptr; + m = M({}, nullptr); + assert(m.key_comp() == nullptr); + // At this point, m.key_comp() is disengaged. + // But the std::function captured by copy inside `vc` remains valid. + auto a = std::make_pair(std::vector{2, 1, 4}, 42); + auto b = std::make_pair(std::vector{1, 2, 3}, 42); + auto c = std::make_pair(std::vector{0, 3, 2}, 42); + assert(vc(a, b)); + assert(vc(b, c)); + assert(!vc(b, a)); + assert(!vc(c, b)); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp new file mode 100644 index 0000000000000..c7c674c034bca --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.observers/keys_values.pass.cpp @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// const key_container_type& keys() const noexcept +// const mapped_container_type& values() const noexcept + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "test_allocator.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + const M m = {{4, 'a'}, {2, 'b'}, {2, 'e'}, {3, 'c'}}; + std::same_as decltype(auto) keys = m.keys(); + std::same_as decltype(auto) values = m.values(); + + // noexcept + static_assert(noexcept(m.keys())); + static_assert(noexcept(m.values())); + + auto expected_keys = {2, 2, 3, 4}; + auto expected_values = {'b', 'e', 'c', 'a'}; + assert(std::ranges::equal(keys, expected_keys)); + assert(std::ranges::equal(values, expected_values)); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp new file mode 100644 index 0000000000000..b3ea0b65a3d93 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains.pass.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// bool contains(const key_type& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 1}, {2, 2}, {2, 3}, {4, 4}, {5, 5}, {8, 1}, {8, 2}, {8, 8}}; + assert(!m.contains(0)); + assert(m.contains(1)); + assert(m.contains(2)); + assert(!m.contains(3)); + assert(m.contains(4)); + assert(m.contains(5)); + assert(!m.contains(6)); + assert(!m.contains(7)); + assert(std::as_const(m).contains(8)); + assert(!std::as_const(m).contains(9)); + m.clear(); + assert(!m.contains(1)); + } + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 0}, {2, 0}, {4, 0}, {2, 1}, {5, 1}, {5, 2}, {5, 0}, {8, 0}}; + assert(!m.contains(0)); + assert(m.contains(1)); + assert(m.contains(2)); + assert(!m.contains(3)); + assert(m.contains(4)); + assert(m.contains(5)); + assert(!m.contains(6)); + assert(!m.contains(7)); + assert(std::as_const(m).contains(8)); + assert(!std::as_const(m).contains(9)); + m.clear(); + assert(!m.contains(1)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp new file mode 100644 index 0000000000000..8a66ec63768d7 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/contains_transparent.pass.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template bool contains(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
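+// For intuition, a hedged sketch of the mechanism that constraint enables
+// (StringCmp here is hypothetical; the tests below use Transparent and
+// TransparentComparator from "../helpers.h"):
+//
+//   struct StringCmp {
+//     using is_transparent = void;  // opts the comparator into heterogeneous lookup
+//     bool operator()(std::string_view a, std::string_view b) const { return a < b; }
+//   };
+//   std::flat_multimap<std::string, int, StringCmp> m{{"alpha", 1}, {"beta", 2}};
+//   assert(m.contains("alpha"));  // compares via string_view; no std::string temporary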
+template +concept CanContains = requires(M m, Transparent k) { m.contains(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanContains); +static_assert(CanContains); +static_assert(!CanContains); +static_assert(!CanContains); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, {"beta", 2}, {"beta", 0}, {"epsilon", 3}, {"eta", 4}, {"eta", 1}, {"gamma", 5}}; + ASSERT_SAME_TYPE(decltype(m.contains(Transparent{"abc"})), bool); + ASSERT_SAME_TYPE(decltype(std::as_const(m).contains(Transparent{"b"})), bool); + assert(m.contains(Transparent{"alpha"}) == true); + assert(m.contains(Transparent{"beta"}) == true); + assert(m.contains(Transparent{"epsilon"}) == true); + assert(m.contains(Transparent{"eta"}) == true); + assert(m.contains(Transparent{"gamma"}) == true); + assert(m.contains(Transparent{"al"}) == false); + assert(m.contains(Transparent{""}) == false); + assert(m.contains(Transparent{"g"}) == false); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {1, 2}, {2, 2}, {3, 3}}, c); + assert(!transparent_used); + auto b = m.contains(Transparent{3}); + assert(b); + assert(transparent_used); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp new file mode 100644 index 0000000000000..59b88428cde3c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count.pass.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// size_type count(const key_type& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 1}, {2, 2}, {2, 2}, {4, 4}, {4, 1}, {4, 3}, {4, 4}, {5, 5}, {8, 8}}; + ASSERT_SAME_TYPE(decltype(m.count(0)), size_t); + assert(m.count(0) == 0); + assert(m.count(1) == 1); + assert(m.count(2) == 2); + assert(m.count(3) == 0); + assert(m.count(4) == 4); + assert(m.count(5) == 1); + assert(m.count(6) == 0); + assert(m.count(7) == 0); + assert(std::as_const(m).count(8) == 1); + assert(std::as_const(m).count(9) == 0); + } + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 0}, {2, 0}, {4, 0}, {1, 0}, {1, 2}, {8, 1}, {5, 0}, {8, 0}}; + ASSERT_SAME_TYPE(decltype(m.count(0)), size_t); + assert(m.count(0) == 0); + assert(m.count(1) == 3); + assert(m.count(2) == 1); + assert(m.count(3) == 0); + assert(m.count(4) == 1); + assert(m.count(5) == 1); + assert(m.count(6) == 0); + assert(m.count(7) == 0); + assert(std::as_const(m).count(8) == 2); + assert(std::as_const(m).count(9) == 0); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp new file mode 100644 index 0000000000000..41f71065b2f75 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/count_transparent.pass.cpp @@ -0,0 +1,83 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template size_type count(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
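+// For intuition, a hedged sketch (not the library implementation) of what
+// count(k) computes for a multimap-style container: the length of the equal
+// range, which the random-access iterators let us take as a difference.
+//
+//   template <class M, class K>
+//   typename M::size_type count_like(const M& m, const K& k) {
+//     auto [first, last] = m.equal_range(k);  // heterogeneous overload
+//     return static_cast<typename M::size_type>(last - first);
+//   }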
+template +concept CanCount = requires(M m, Transparent k) { m.count(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanCount); +static_assert(CanCount); +static_assert(!CanCount); +static_assert(!CanCount); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"beta", 2}, + {"beta", 2}, + {"epsilon", 3}, + {"eta", 4}, + {"eta", 1}, + {"eta", 5}, + {"gamma", 6}, + {"gamma", 5}}; + ASSERT_SAME_TYPE(decltype(m.count(Transparent{"abc"})), typename M::size_type); + ASSERT_SAME_TYPE(decltype(std::as_const(m).count(Transparent{"b"})), typename M::size_type); + assert(m.count(Transparent{"alpha"}) == 1); + assert(m.count(Transparent{"beta"}) == 2); + assert(m.count(Transparent{"epsilon"}) == 1); + assert(m.count(Transparent{"eta"}) == 3); + assert(m.count(Transparent{"gamma"}) == 2); + assert(m.count(Transparent{"al"}) == 0); + assert(m.count(Transparent{""}) == 0); + assert(m.count(Transparent{"g"}) == 0); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m( + std::sorted_equivalent, {{1, 1}, {2, 2}, {2, 2}, {3, 3}, {3, 3}}, c); + assert(!transparent_used); + auto n = m.count(Transparent{3}); + assert(n == 2); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp new file mode 100644 index 0000000000000..ac369b77a7f3d --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range.pass.cpp @@ -0,0 +1,81 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// pair equal_range(const key_type& k); +// pair equal_range(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = std::pair; + using CR = std::pair; + M m = {{1, 'a'}, {1, 'a'}, {1, 'A'}, {2, 'b'}, {4, 'd'}, {5, 'E'}, {5, 'e'}, {8, 'h'}, {8, 'z'}}; + ASSERT_SAME_TYPE(decltype(m.equal_range(0)), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(0)), CR); + auto begin = m.begin(); + assert(m.equal_range(0) == std::pair(begin, begin)); + assert(m.equal_range(1) == std::pair(begin, begin + 3)); + assert(m.equal_range(2) == std::pair(begin + 3, begin + 4)); + assert(m.equal_range(3) == std::pair(begin + 4, begin + 4)); + assert(m.equal_range(4) == std::pair(begin + 4, begin + 5)); + assert(m.equal_range(5) == std::pair(begin + 5, begin + 7)); + assert(m.equal_range(6) == std::pair(begin + 7, begin + 7)); + assert(m.equal_range(7) == std::pair(begin + 7, begin + 7)); + assert(std::as_const(m).equal_range(8) == std::pair(m.cbegin() + 7, m.cbegin() + 9)); + assert(std::as_const(m).equal_range(9) == std::pair(m.cbegin() + 9, m.cbegin() + 9)); + } + + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + using R = std::pair; + using CR = std::pair; + M m = { + {1, 'a'}, {2, 'b'}, {2, 'b'}, {2, 'c'}, {4, 'a'}, {4, 'b'}, {4, 'c'}, {4, 'd'}, {5, 'e'}, {8, 'a'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.equal_range(0)), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(0)), CR); + auto begin = m.begin(); + assert(m.equal_range(0) == std::pair(begin + 11, begin + 11)); + assert(m.equal_range(1) == std::pair(begin + 10, begin + 11)); + assert(m.equal_range(2) == std::pair(begin + 7, begin + 10)); + assert(m.equal_range(3) == std::pair(begin + 7, begin + 7)); + assert(m.equal_range(4) == std::pair(begin + 3, begin + 7)); + assert(m.equal_range(5) == std::pair(begin + 2, begin + 3)); + assert(m.equal_range(6) == std::pair(begin + 2, begin + 2)); + assert(m.equal_range(7) == std::pair(begin + 2, begin + 2)); + assert(std::as_const(m).equal_range(8) == std::pair(m.cbegin(), m.cbegin() + 2)); + assert(std::as_const(m).equal_range(9) == std::pair(m.cbegin(), m.cbegin())); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp new file mode 100644 index 0000000000000..3666492bb921f --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/equal_range_transparent.pass.cpp @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template pair equal_range(const K& x); +// template pair equal_range(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanEqualRange = requires(M m, Transparent k) { m.equal_range(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanEqualRange); +static_assert(CanEqualRange); +static_assert(!CanEqualRange); +static_assert(!CanEqualRange); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + using R = std::pair; + using CR = std::pair; + M m = {{"alpha", 1}, + {"alpha", 1}, + {"alpha", 3}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 0}, + {"eta", 4}, + {"gamma", 5}, + {"gamma", 1}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.equal_range(Transparent{"abc"})), R); + ASSERT_SAME_TYPE(decltype(std::as_const(m).equal_range(Transparent{"b"})), CR); + + auto test_found = [&](auto&& map, const auto& expected_key, std::initializer_list expected_values) { + auto [first, last] = map.equal_range(Transparent{expected_key}); + auto expected_range = + expected_values | std::views::transform([&](auto&& val) { return std::pair(expected_key, val); }); + assert(std::ranges::equal(std::ranges::subrange(first, last), expected_range)); + }; + + auto test_not_found = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto [first, last] = map.equal_range(Transparent{expected_key}); + assert(first == last); + assert(first - m.begin() == expected_offset); + }; + + test_found(m, "alpha", {1, 1, 3}); + test_found(m, "beta", {2}); + test_found(m, "epsilon", {3, 0}); + test_found(m, "eta", {4}); + test_found(m, "gamma", {5, 1}); + test_found(cm, "alpha", {1, 1, 3}); + test_found(cm, "beta", {2}); + test_found(cm, "epsilon", {3, 0}); + test_found(cm, "eta", {4}); + test_found(cm, "gamma", {5, 1}); + + test_not_found(m, "charlie", 4); + test_not_found(m, "aaa", 0); + test_not_found(m, "zzz", 9); + test_not_found(cm, "charlie", 4); + test_not_found(cm, "aaa", 0); + test_not_found(cm, "zzz", 9); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 1}, {3, 3}}, c); + assert(!transparent_used); + auto p = m.equal_range(Transparent{3}); + assert(p.first == m.begin() + 2); + assert(p.second == m.end()); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp new file mode 100644 index 0000000000000..74b7051eb0d7b --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find.pass.cpp @@ -0,0 +1,57 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator find(const key_type& k); +// const_iterator find(const key_type& k) const; + +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap, KeyContainer, ValueContainer>; + + M m = {{1, 'a'}, {1, 'a'}, {1, 'b'}, {2, 'c'}, {2, 'b'}, {4, 'a'}, {4, 'd'}, {5, 'e'}, {8, 'a'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.find(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).find(0)), typename M::const_iterator); + assert(m.find(0) == m.end()); + assert(m.find(1) == m.begin()); + assert(m.find(2) == m.begin() + 3); + assert(m.find(3) == m.end()); + assert(m.find(4) == m.begin() + 5); + assert(m.find(5) == m.begin() + 7); + assert(m.find(6) == m.end()); + assert(m.find(7) == m.end()); + assert(std::as_const(m).find(8) == m.begin() + 8); + assert(std::as_const(m).find(9) == m.end()); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp new file mode 100644 index 0000000000000..be8c6f2e35440 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/find_transparent.pass.cpp @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator find(const K& x); +// template const_iterator find(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. 
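+// Hedged note with a small sketch: when several equivalent keys exist, the
+// offsets asserted below assume find(k) lands on the first element of the
+// equal range, i.e. that it behaves like the usual lower_bound idiom:
+//
+//   auto find_like = [](auto& m, const auto& k) {
+//     auto it = m.lower_bound(k);                      // first element not less than k
+//     return (it != m.end() && !m.key_comp()(k, it->first)) ? it : m.end();
+//   };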
+template +concept CanFind = requires(M m, Transparent k) { m.find(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanFind); +static_assert(CanFind); +static_assert(!CanFind); +static_assert(!CanFind); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"beta", 2}, + {"beta", 0}, + {"beta", 1}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 1}, + {"eta", 4}, + {"gamma", 6}, + {"gamma", 5}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.find(Transparent{"abc"})), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).find(Transparent{"b"})), typename M::const_iterator); + + auto test_find = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto iter = map.find(Transparent{expected_key}); + assert(iter - map.begin() == expected_offset); + }; + + test_find(m, "alpha", 0); + test_find(m, "beta", 1); + test_find(m, "epsilon", 5); + test_find(m, "eta", 7); + test_find(m, "gamma", 8); + test_find(m, "charlie", 10); + test_find(m, "aaa", 10); + test_find(m, "zzz", 10); + test_find(cm, "alpha", 0); + test_find(cm, "beta", 1); + test_find(cm, "epsilon", 5); + test_find(cm, "eta", 7); + test_find(cm, "gamma", 8); + test_find(cm, "charlie", 10); + test_find(cm, "aaa", 10); + test_find(cm, "zzz", 10); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 3}, {3, 3}}, c); + assert(!transparent_used); + auto it = m.find(Transparent{3}); + assert(it != m.end()); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp new file mode 100644 index 0000000000000..c3befdda7de6e --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound.pass.cpp @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator lower_bound(const key_type& k); +// const_iterator lower_bound(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 'a'}, {2, 'a'}, {2, 'c'}, {2, 'b'}, {4, 'd'}, {5, 'a'}, {5, 'e'}, {8, 'h'}, {8, 'a'}}; + ASSERT_SAME_TYPE(decltype(m.lower_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(0)), typename M::const_iterator); + assert(m.lower_bound(0) == m.begin()); + assert(m.lower_bound(1) == m.begin()); + assert(m.lower_bound(2) == m.begin() + 1); + assert(m.lower_bound(3) == m.begin() + 4); + assert(m.lower_bound(4) == m.begin() + 4); + assert(m.lower_bound(5) == m.begin() + 5); + assert(m.lower_bound(6) == m.begin() + 7); + assert(m.lower_bound(7) == m.begin() + 7); + assert(std::as_const(m).lower_bound(8) == m.begin() + 7); + assert(std::as_const(m).lower_bound(9) == m.end()); + } + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = {{1, 'a'}, {1, 'b'}, {2, 'b'}, {4, 'd'}, {4, 'a'}, {4, 'e'}, {5, 'e'}, {8, 'a'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.lower_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(0)), typename M::const_iterator); + assert(m.lower_bound(0) == m.end()); + assert(m.lower_bound(1) == m.begin() + 7); + assert(m.lower_bound(2) == m.begin() + 6); + assert(m.lower_bound(3) == m.begin() + 6); + assert(m.lower_bound(4) == m.begin() + 3); + assert(m.lower_bound(5) == m.begin() + 2); + assert(m.lower_bound(6) == m.begin() + 2); + assert(m.lower_bound(7) == m.begin() + 2); + assert(std::as_const(m).lower_bound(8) == m.begin()); + assert(std::as_const(m).lower_bound(9) == m.begin()); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp new file mode 100644 index 0000000000000..b757af132e677 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/lower_bound_transparent.pass.cpp @@ -0,0 +1,107 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator lower_bound(const K& x); +// template const_iterator lower_bound(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanLowerBound = requires(M m, Transparent k) { m.lower_bound(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanLowerBound); +static_assert(CanLowerBound); +static_assert(!CanLowerBound); +static_assert(!CanLowerBound); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"alpha", 2}, + {"alpha", 3}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 4}, + {"eta", 4}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.lower_bound(Transparent{"abc"})), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(Transparent{"b"})), typename M::const_iterator); + + auto test_lower_bound = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto iter = map.lower_bound(Transparent{expected_key}); + assert(iter - map.begin() == expected_offset); + }; + + test_lower_bound(m, "abc", 0); + test_lower_bound(m, "alpha", 0); + test_lower_bound(m, "beta", 3); + test_lower_bound(m, "bets", 4); + test_lower_bound(m, "charlie", 4); + test_lower_bound(m, "echo", 4); + test_lower_bound(m, "epsilon", 4); + test_lower_bound(m, "eta", 6); + test_lower_bound(m, "gamma", 7); + test_lower_bound(m, "golf", 11); + test_lower_bound(m, "zzz", 11); + + test_lower_bound(cm, "abc", 0); + test_lower_bound(cm, "alpha", 0); + test_lower_bound(cm, "beta", 3); + test_lower_bound(cm, "bets", 4); + test_lower_bound(cm, "charlie", 4); + test_lower_bound(cm, "echo", 4); + test_lower_bound(cm, "epsilon", 4); + test_lower_bound(cm, "eta", 6); + test_lower_bound(cm, "gamma", 7); + test_lower_bound(cm, "golf", 11); + test_lower_bound(cm, "zzz", 11); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {3, 3}}, c); + assert(!transparent_used); + auto it = m.lower_bound(Transparent{3}); + assert(it != m.end()); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp new file mode 100644 index 0000000000000..d73d030236e22 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound.pass.cpp @@ -0,0 +1,76 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// iterator upper_bound(const key_type& k); +// const_iterator upper_bound(const key_type& k) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = { + {1, 'a'}, {2, 'b'}, {4, 'd'}, {4, 'e'}, {4, 'a'}, {4, 'b'}, {5, 'e'}, {5, 'a'}, {8, 'a'}, {8, 'b'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.upper_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).upper_bound(0)), typename M::const_iterator); + assert(m.upper_bound(0) == m.begin()); + assert(m.upper_bound(1) == m.begin() + 1); + assert(m.upper_bound(2) == m.begin() + 2); + assert(m.upper_bound(3) == m.begin() + 2); + assert(m.upper_bound(4) == m.begin() + 6); + assert(m.upper_bound(5) == m.begin() + 8); + assert(m.upper_bound(6) == m.begin() + 8); + assert(std::as_const(m).upper_bound(7) == m.begin() + 8); + assert(std::as_const(m).upper_bound(8) == m.end()); + assert(std::as_const(m).upper_bound(9) == m.end()); + } + + { + using M = std::flat_multimap, KeyContainer, ValueContainer>; + M m = { + {1, 'a'}, {2, 'b'}, {4, 'd'}, {4, 'e'}, {4, 'a'}, {4, 'b'}, {5, 'e'}, {5, 'a'}, {8, 'a'}, {8, 'b'}, {8, 'h'}}; + ASSERT_SAME_TYPE(decltype(m.upper_bound(0)), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).upper_bound(0)), typename M::const_iterator); + assert(m.upper_bound(0) == m.end()); + assert(m.upper_bound(1) == m.end()); + assert(m.upper_bound(2) == m.begin() + 10); + assert(m.upper_bound(3) == m.begin() + 9); + assert(m.upper_bound(4) == m.begin() + 9); + assert(m.upper_bound(5) == m.begin() + 5); + assert(m.upper_bound(6) == m.begin() + 3); + assert(m.upper_bound(7) == m.begin() + 3); + assert(std::as_const(m).upper_bound(8) == m.begin() + 3); + assert(std::as_const(m).upper_bound(9) == m.begin()); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp new file mode 100644 index 0000000000000..969489d0fe619 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/flat.multimap.operations/upper_bound_transparent.pass.cpp @@ -0,0 +1,106 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// template iterator upper_bound(const K& x); +// template const_iterator upper_bound(const K& x) const; + +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "../helpers.h" +#include "test_macros.h" +#include "min_allocator.h" + +// Constraints: The qualified-id Compare::is_transparent is valid and denotes a type. +template +concept CanUpperBound = requires(M m, Transparent k) { m.upper_bound(k); }; +using TransparentMap = std::flat_multimap; +using NonTransparentMap = std::flat_multimap; +static_assert(CanUpperBound); +static_assert(CanUpperBound); +static_assert(!CanUpperBound); +static_assert(!CanUpperBound); + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + using M = std::flat_multimap; + + M m = {{"alpha", 1}, + {"alpha", 2}, + {"alpha", 3}, + {"beta", 2}, + {"epsilon", 3}, + {"epsilon", 4}, + {"eta", 4}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}, + {"gamma", 5}}; + const auto& cm = m; + ASSERT_SAME_TYPE(decltype(m.lower_bound(Transparent{"abc"})), typename M::iterator); + ASSERT_SAME_TYPE(decltype(std::as_const(m).lower_bound(Transparent{"b"})), typename M::const_iterator); + + auto test_upper_bound = [&](auto&& map, const std::string& expected_key, long expected_offset) { + auto iter = map.upper_bound(Transparent{expected_key}); + assert(iter - map.begin() == expected_offset); + }; + + test_upper_bound(m, "abc", 0); + test_upper_bound(m, "alpha", 3); + test_upper_bound(m, "beta", 4); + test_upper_bound(m, "bets", 4); + test_upper_bound(m, "charlie", 4); + test_upper_bound(m, "echo", 4); + test_upper_bound(m, "epsilon", 6); + test_upper_bound(m, "eta", 7); + test_upper_bound(m, "gamma", 11); + test_upper_bound(m, "golf", 11); + test_upper_bound(m, "zzz", 11); + + test_upper_bound(cm, "abc", 0); + test_upper_bound(cm, "alpha", 3); + test_upper_bound(cm, "beta", 4); + test_upper_bound(cm, "bets", 4); + test_upper_bound(cm, "charlie", 4); + test_upper_bound(cm, "echo", 4); + test_upper_bound(cm, "epsilon", 6); + test_upper_bound(cm, "eta", 7); + test_upper_bound(cm, "gamma", 11); + test_upper_bound(cm, "golf", 11); + test_upper_bound(cm, "zzz", 11); +} + +int main(int, char**) { + test, std::vector>(); + test, std::vector>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + { + bool transparent_used = false; + TransparentComparator c(transparent_used); + std::flat_multimap m(std::sorted_equivalent, {{1, 1}, {2, 2}, {2, 2}, {3, 3}}, c); + assert(!transparent_used); + auto it = m.upper_bound(Transparent{2}); + assert(it == m.begin() + 3); + assert(transparent_used); + } + + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h new file mode 100644 index 0000000000000..252e2454d497c --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/helpers.h @@ -0,0 +1,389 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef SUPPORT_FLAT_MULTIMAP_HELPERS_H +#define SUPPORT_FLAT_MULTIMAP_HELPERS_H + +#include +#include +#include +#include +#include + +#include "test_allocator.h" +#include "test_macros.h" + +template +void check_invariant(const std::flat_multimap& m) { + assert(m.keys().size() == m.values().size()); + const auto& keys = m.keys(); + assert(std::is_sorted(keys.begin(), keys.end(), m.key_comp())); +} + +struct StartsWith { + explicit StartsWith(char ch) : lower_(1, ch), upper_(1, ch + 1) {} + StartsWith(const StartsWith&) = delete; + void operator=(const StartsWith&) = delete; + struct Less { + using is_transparent = void; + bool operator()(const std::string& a, const std::string& b) const { return a < b; } + bool operator()(const StartsWith& a, const std::string& b) const { return a.upper_ <= b; } + bool operator()(const std::string& a, const StartsWith& b) const { return a < b.lower_; } + bool operator()(const StartsWith&, const StartsWith&) const { + assert(false); // should not be called + return false; + } + }; + +private: + std::string lower_; + std::string upper_; +}; + +template +struct CopyOnlyVector : std::vector { + using std::vector::vector; + + CopyOnlyVector(const CopyOnlyVector&) = default; + CopyOnlyVector(CopyOnlyVector&& other) : CopyOnlyVector(other) {} + CopyOnlyVector(CopyOnlyVector&& other, std::vector::allocator_type alloc) : CopyOnlyVector(other, alloc) {} + + CopyOnlyVector& operator=(const CopyOnlyVector&) = default; + CopyOnlyVector& operator=(CopyOnlyVector& other) { return this->operator=(other); } +}; + +template +struct Transparent { + T t; + + operator T() const + requires ConvertibleToT + { + return t; + } +}; + +template +using ConvertibleTransparent = Transparent; + +template +using NonConvertibleTransparent = Transparent; + +struct TransparentComparator { + using is_transparent = void; + + bool* transparent_used = nullptr; + TransparentComparator() = default; + TransparentComparator(bool& used) : transparent_used(&used) {} + + template + bool operator()(const T& t, const Transparent& transparent) const { + if (transparent_used != nullptr) { + *transparent_used = true; + } + return t < transparent.t; + } + + template + bool operator()(const Transparent& transparent, const T& t) const { + if (transparent_used != nullptr) { + *transparent_used = true; + } + return transparent.t < t; + } + + template + bool operator()(const T& t1, const T& t2) const { + return t1 < t2; + } +}; + +struct NonTransparentComparator { + template + bool operator()(const T&, const Transparent&) const; + + template + bool operator()(const Transparent&, const T&) const; + + template + bool operator()(const T&, const T&) const; +}; + +struct NoDefaultCtr { + NoDefaultCtr() = delete; +}; + +#ifndef TEST_HAS_NO_EXCEPTIONS +template +struct EmplaceUnsafeContainer : std::vector { + using std::vector::vector; + + template + auto emplace(Args&&... args) -> decltype(std::declval>().emplace(std::forward(args)...)) { + if (this->size() > 1) { + auto it1 = this->begin(); + auto it2 = it1 + 1; + // messing up the container + std::iter_swap(it1, it2); + } + + throw 42; + } + + template + auto insert(Args&&... 
args) -> decltype(std::declval>().insert(std::forward(args)...)) { + if (this->size() > 1) { + auto it1 = this->begin(); + auto it2 = it1 + 1; + // messing up the container + std::iter_swap(it1, it2); + } + + throw 42; + } +}; + +template +struct ThrowOnEraseContainer : std::vector { + using std::vector::vector; + + template + auto erase(Args&&... args) -> decltype(std::declval>().erase(std::forward(args)...)) { + throw 42; + } +}; + +template +struct ThrowOnMoveContainer : std::vector { + using std::vector::vector; + + ThrowOnMoveContainer(ThrowOnMoveContainer&&) { throw 42; } + + ThrowOnMoveContainer& operator=(ThrowOnMoveContainer&&) { throw 42; } +}; + +#endif + +template +void test_emplace_exception_guarantee([[maybe_unused]] F&& emplace_function) { +#ifndef TEST_HAS_NO_EXCEPTIONS + using C = TransparentComparator; + { + // Throw on emplace the key, and underlying has strong exception guarantee + using KeyContainer = std::vector>; + using M = std::flat_multimap; + + LIBCPP_STATIC_ASSERT(std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + + test_allocator_statistics stats; + + KeyContainer a({1, 1, 2, 4}, test_allocator{&stats}); + std::vector b = {5, 6, 7, 8}; + [[maybe_unused]] auto expected_keys = a; + [[maybe_unused]] auto expected_values = b; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + + stats.throw_after = 1; + try { + emplace_function(m, 1, 1); + assert(false); + } catch (const std::bad_alloc&) { + check_invariant(m); + // In libc++, the flat_multimap is unchanged + LIBCPP_ASSERT(m.size() == 4); + LIBCPP_ASSERT(m.keys() == expected_keys); + LIBCPP_ASSERT(m.values() == expected_values); + } + } + { + // Throw on emplace the key, and underlying has no strong exception guarantee + using KeyContainer = EmplaceUnsafeContainer; + using M = std::flat_multimap; + + LIBCPP_STATIC_ASSERT(!std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + KeyContainer a = {1, 2, 2, 4}; + std::vector b = {5, 6, 7, 8}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + try { + emplace_function(m, 1, 1); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, the flat_multimap is cleared + LIBCPP_ASSERT(m.size() == 0); + } + } + { + // Throw on emplace the value, and underlying has strong exception guarantee + using ValueContainer = std::vector>; + ; + using M = std::flat_multimap, ValueContainer>; + + LIBCPP_STATIC_ASSERT(std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + + std::vector a = {1, 3, 3, 4}; + test_allocator_statistics stats; + ValueContainer b({1, 2, 3, 4}, test_allocator{&stats}); + + [[maybe_unused]] auto expected_keys = a; + [[maybe_unused]] auto expected_values = b; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + + stats.throw_after = 1; + try { + emplace_function(m, 3, 3); + assert(false); + } catch (const std::bad_alloc&) { + check_invariant(m); + // In libc++, the emplaced key is erased and the flat_multimap is unchanged + LIBCPP_ASSERT(m.size() == 4); + LIBCPP_ASSERT(m.keys() == expected_keys); + LIBCPP_ASSERT(m.values() == expected_values); + } + } + { + // Throw on emplace the value, and underlying has no strong exception guarantee + using ValueContainer = EmplaceUnsafeContainer; + using M = std::flat_multimap, ValueContainer>; + + LIBCPP_STATIC_ASSERT(!std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + std::vector a = {1, 1, 1, 1}; + ValueContainer b = {1, 2, 3, 4}; + + M m(std::sorted_equivalent, std::move(a), 
std::move(b)); + + try { + emplace_function(m, 1, 5); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, the flat_multimap is cleared + LIBCPP_ASSERT(m.size() == 0); + } + } + { + // Throw on emplace the value, then throw again on erasing the key + using KeyContainer = ThrowOnEraseContainer; + using ValueContainer = std::vector>; + using M = std::flat_multimap; + + LIBCPP_STATIC_ASSERT(std::__container_traits::__emplacement_has_strong_exception_safety_guarantee); + + KeyContainer a = {4, 4, 4, 4}; + test_allocator_statistics stats; + ValueContainer b({1, 2, 3, 4}, test_allocator{&stats}); + + M m(std::sorted_equivalent, std::move(a), std::move(b)); + stats.throw_after = 1; + try { + emplace_function(m, 0, 0); + assert(false); + } catch (const std::bad_alloc&) { + check_invariant(m); + // In libc++, we try to erase the key after value emplacement failure. + // and after erasure failure, we clear the flat_multimap + LIBCPP_ASSERT(m.size() == 0); + } + } +#endif +} + +template +void test_insert_range_exception_guarantee([[maybe_unused]] F&& insert_function) { +#ifndef TEST_HAS_NO_EXCEPTIONS + using KeyContainer = EmplaceUnsafeContainer; + using ValueContainer = std::vector; + using M = std::flat_multimap; + test_allocator_statistics stats; + KeyContainer a{1, 2, 3, 4}; + ValueContainer b{1, 2, 3, 4}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + + std::vector> newValues = {{0, 0}, {1, 1}, {5, 5}, {6, 6}, {7, 7}, {8, 8}}; + stats.throw_after = 1; + try { + insert_function(m, newValues); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear if anything goes wrong when inserting a range + LIBCPP_ASSERT(m.size() == 0); + } +#endif +} + +template +void test_erase_exception_guarantee([[maybe_unused]] F&& erase_function) { +#ifndef TEST_HAS_NO_EXCEPTIONS + { + // key erase throws + using KeyContainer = ThrowOnEraseContainer; + using ValueContainer = std::vector; + using M = std::flat_multimap; + + KeyContainer a{1, 3, 3, 4}; + ValueContainer b{1, 3, 3, 4}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + try { + erase_function(m, 3); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear if anything goes wrong when erasing + LIBCPP_ASSERT(m.size() == 0); + } + } + { + // key erase throws + using KeyContainer = std::vector; + using ValueContainer = ThrowOnEraseContainer; + using M = std::flat_multimap; + + KeyContainer a{1, 3, 3, 4}; + ValueContainer b{1, 3, 3, 4}; + M m(std::sorted_equivalent, std::move(a), std::move(b)); + try { + erase_function(m, 3); + assert(false); + } catch (int) { + check_invariant(m); + // In libc++, we clear if anything goes wrong when erasing + LIBCPP_ASSERT(m.size() == 0); + } + } +#endif +} +class Moveable { + int int_; + double double_; + +public: + Moveable() : int_(0), double_(0) {} + Moveable(int i, double d) : int_(i), double_(d) {} + Moveable(Moveable&& x) : int_(x.int_), double_(x.double_) { + x.int_ = -1; + x.double_ = -1; + } + Moveable& operator=(Moveable&& x) { + int_ = x.int_; + x.int_ = -1; + double_ = x.double_; + x.double_ = -1; + return *this; + } + + Moveable(const Moveable&) = delete; + Moveable& operator=(const Moveable&) = delete; + bool operator==(const Moveable& x) const { return int_ == x.int_ && double_ == x.double_; } + bool operator<(const Moveable& x) const { return int_ < x.int_ || (int_ == x.int_ && double_ < x.double_); } + + int get() const { return int_; } + bool moved() const { return int_ == -1; } +}; + +#endif // 
SUPPORT_FLAT_MULTIMAP_HELPERS_H
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp
new file mode 100644
index 0000000000000..e4325b1dfe3ba
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/incomplete_type.pass.cpp
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+// <flat_map>
+
+// Check that std::flat_multimap and its iterators can be instantiated with an incomplete
+// type.
+
+#include <flat_map>
+#include <vector>
+
+struct A {
+  using Map = std::flat_multimap<A, A>;
+  int data;
+  Map m;
+  Map::iterator it;
+  Map::const_iterator cit;
+};
+
+// Implement the operator< required in order to instantiate flat_multimap<A, A>
+bool operator<(A const& L, A const& R) { return L.data < R.data; }
+
+int main(int, char**) {
+  A a;
+  return 0;
+}
diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp
new file mode 100644
index 0000000000000..680ff1a127dda
--- /dev/null
+++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/op_compare.pass.cpp
@@ -0,0 +1,133 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// + +// class flat_multimap + +// friend bool operator==(const flat_multimap& x, const flat_multimap& y); +// friend synth-three-way-result +// operator<=>(const flat_multimap& x, const flat_multimap& y); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "MinSequenceContainer.h" +#include "test_macros.h" +#include "min_allocator.h" +#include "test_allocator.h" +#include "test_comparisons.h" +#include "test_container_comparisons.h" + +template +void test() { + using Key = typename KeyContainer::value_type; + using Value = typename ValueContainer::value_type; + + { + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = {{2, 0}}; // {{1,1}} versus {{2,0}} + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::strong_ordering); + AssertComparisonsReturnBool(); + assert(testComparisons(s1, s2, false, true)); + s2 = {{1, 1}}; // {{1,1}} versus {{1,1}} + assert(testComparisons(s1, s2, true, false)); + s2 = {{1, 1}, {2, 0}}; // {{1,1}} versus {{1,1},{2,0}} + assert(testComparisons(s1, s2, false, true)); + s1 = {{0, 0}, {1, 1}, {2, 2}}; // {{0,0},{1,1},{2,2}} versus {{1,1},{2,0}} + assert(testComparisons(s1, s2, false, true)); + s2 = {{0, 0}, {1, 1}, {2, 3}}; // {{0,0},{1,1},{2,2}} versus {{0,0},{1,1},{2,3}} + assert(testComparisons(s1, s2, false, true)); + + s1 = {{1, 1}, {1, 1}}; + s2 = {{1, 1}, {1, 1}}; + assert(testComparisons(s1, s2, true, false)); + + s2 = {{1, 1}, {1, 1}, {2, 2}}; + assert(testComparisons(s1, s2, false, true)); + + s2 = {{1, 1}, {2, 2}, {2, 2}}; + assert(testComparisons(s1, s2, false, true)); + + s2 = {{0, 0}, {1, 1}, {1, 1}}; + assert(testComparisons(s1, s2, false, false)); + } + { + // Comparisons use value_type's native operators, not the comparator + using C = std::flat_multimap>; + C s1 = {{1, 1}}; + C s2 = {{2, 0}}; // {{1,1}} versus {{2,0}} + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::strong_ordering); + AssertComparisonsReturnBool(); + assert(testComparisons(s1, s2, false, true)); + s2 = {{1, 1}}; // {{1,1}} versus {{1,1}} + assert(testComparisons(s1, s2, true, false)); + s2 = {{1, 1}, {2, 0}}; // {{1,1}} versus {{2,0},{1,1}} + assert(testComparisons(s1, s2, false, true)); + s1 = {{0, 0}, {1, 1}, {2, 2}}; // {{2,2},{1,1},{0,0}} versus {2,0},{1,1}} + assert(testComparisons(s1, s2, false, false)); + s2 = {{0, 0}, {1, 1}, {2, 3}}; // {{2,2},{1,1},{0,0}} versus {{2,3},{1,1},{0,0}} + assert(testComparisons(s1, s2, false, true)); + } +} + +int main(int, char**) { + test, std::vector>(); + test, std::deque>(); + test, MinSequenceContainer>(); + test>, std::vector>>(); + test>, std::vector>>(); + + { + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = C(std::sorted_equivalent, {{std::numeric_limits::quiet_NaN(), 2}}); + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering); + AssertComparisonsReturnBool(); + assert(testComparisonsComplete(s1, s2, false, false, false)); + } + { + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = C(std::sorted_equivalent, {{2, std::numeric_limits::quiet_NaN()}}); + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering); + AssertComparisonsReturnBool(); + assert(testComparisonsComplete(s1, s2, false, true, false)); + s2 = C(std::sorted_equivalent, {{1, std::numeric_limits::quiet_NaN()}}); + assert(testComparisonsComplete(s1, s2, false, false, false)); + } + { + // Comparisons 
use value_type's native operators, not the comparator + struct StrongComp { + bool operator()(double a, double b) const { return std::strong_order(a, b) < 0; } + }; + using C = std::flat_multimap; + C s1 = {{1, 1}}; + C s2 = {{std::numeric_limits::quiet_NaN(), std::numeric_limits::quiet_NaN()}}; + ASSERT_SAME_TYPE(decltype(s1 <=> s2), std::partial_ordering); + AssertComparisonsReturnBool(); + assert(testComparisonsComplete(s1, s2, false, false, false)); + s1 = {{{1, 1}, {std::numeric_limits::quiet_NaN(), 1}}}; + s2 = {{{std::numeric_limits::quiet_NaN(), 1}, {1, 1}}}; + assert(std::lexicographical_compare_three_way( + s1.keys().begin(), s1.keys().end(), s2.keys().begin(), s2.keys().end(), std::strong_order) == + std::strong_ordering::equal); + assert(s1 != s2); + assert((s1 <=> s2) == std::partial_ordering::unordered); + } + return 0; +} diff --git a/libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp b/libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp new file mode 100644 index 0000000000000..490d51c299793 --- /dev/null +++ b/libcxx/test/std/containers/container.adaptors/flat.multimap/types.compile.pass.cpp @@ -0,0 +1,133 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +// using key_type = Key; +// using mapped_type = T; +// using value_type = pair; +// using key_compare = Compare; +// using reference = pair; +// using const_reference = pair; +// using size_type = size_t; +// using difference_type = ptrdiff_t; +// using iterator = implementation-defined; // see [container.requirements] +// using const_iterator = implementation-defined; // see [container.requirements] +// using reverse_iterator = std::reverse_iterator; +// using const_reverse_iterator = std::reverse_iterator; +// using key_container_type = KeyContainer; +// using mapped_container_type = MappedContainer; + +// class value_compare; + +// struct containers { +// key_container_type keys; +// mapped_container_type values; +// }; + +#include +#include +#include +#include +#include +#include +#include +#include "min_allocator.h" + +void test() { + { + using M = std::flat_multimap; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(requires { typename M::iterator; }); + static_assert(requires { typename M::const_iterator; }); + static_assert(std::is_same_v>); + static_assert( + std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(requires { typename M::value_compare; }); + static_assert(requires { typename M::containers; }); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + } + + { + struct A {}; + struct Compare { + bool operator()(const std::string&, const std::string&) const; + }; + using M = std::flat_multimap, std::deque>; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + 
static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(requires { typename M::iterator; }); + static_assert(requires { typename M::const_iterator; }); + static_assert(std::is_same_v>); + static_assert( + std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(requires { typename M::value_compare; }); + static_assert(requires { typename M::containers; }); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + } + { + using C = std::flat_multimap; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(!std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + } + { + using C = std::flat_multimap, std::deque>>; + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(!std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::random_access_iterator); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + // size_type is invariably size_t + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>); + } +} diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp new file mode 100644 index 0000000000000..0add849312d5e --- /dev/null +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/flat_map.version.compile.pass.cpp @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// WARNING: This test was generated by generate_feature_test_macro_components.py +// and should not be edited manually. 
+// +// clang-format off + +// + +// Test the feature test macros defined by + +/* Constant Value + __cpp_lib_flat_map 202207L [C++23] +*/ + +#include +#include "test_macros.h" + +#if TEST_STD_VER < 14 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 14 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 17 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 20 + +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +#elif TEST_STD_VER == 23 + +# ifndef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should be defined in c++23" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++23" +# endif + +#elif TEST_STD_VER > 23 + +# ifndef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should be defined in c++26" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++26" +# endif + +#endif // TEST_STD_VER > 23 + diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 605788f559d3c..8f5788d2bed20 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -88,6 +88,8 @@ 201902L [C++20] __cpp_lib_expected 202211L [C++23] __cpp_lib_filesystem 201703L [C++17] + __cpp_lib_flat_map 202207L [C++23] + __cpp_lib_flat_set 202207L [C++23] __cpp_lib_format 202110L [C++20] __cpp_lib_format_path 202403L [C++26] __cpp_lib_format_ranges 202207L [C++23] @@ -528,6 +530,14 @@ # error "__cpp_lib_filesystem should not be defined before c++17" # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifdef __cpp_lib_format # error "__cpp_lib_format should not be defined before c++20" # endif @@ -1399,6 +1409,14 @@ # error "__cpp_lib_filesystem should not be defined before c++17" # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifdef __cpp_lib_format # error "__cpp_lib_format should not be defined before c++20" # endif @@ -2390,6 +2408,14 @@ # endif # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifdef __cpp_lib_format # error "__cpp_lib_format should not be defined before c++20" # endif @@ -3651,6 +3677,14 @@ # endif # endif +# ifdef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should not be defined before c++23" +# endif + +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined before c++23" +# endif + # ifndef __cpp_lib_format # error "__cpp_lib_format should be defined in c++20" # endif @@ -5092,6 +5126,26 @@ # endif # endif +# ifndef __cpp_lib_flat_map +# error 
"__cpp_lib_flat_map should be defined in c++23" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++23" +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should be defined in c++23" +# endif +# if __cpp_lib_flat_set != 202207L +# error "__cpp_lib_flat_set should have the value 202207L in c++23" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!" +# endif +# endif + # ifndef __cpp_lib_format # error "__cpp_lib_format should be defined in c++23" # endif @@ -6779,6 +6833,26 @@ # endif # endif +# ifndef __cpp_lib_flat_map +# error "__cpp_lib_flat_map should be defined in c++26" +# endif +# if __cpp_lib_flat_map != 202207L +# error "__cpp_lib_flat_map should have the value 202207L in c++26" +# endif + +# if !defined(_LIBCPP_VERSION) +# ifndef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should be defined in c++26" +# endif +# if __cpp_lib_flat_set != 202207L +# error "__cpp_lib_flat_set should have the value 202207L in c++26" +# endif +# else // _LIBCPP_VERSION +# ifdef __cpp_lib_flat_set +# error "__cpp_lib_flat_set should not be defined because it is unimplemented in libc++!" +# endif +# endif + # ifndef __cpp_lib_format # error "__cpp_lib_format should be defined in c++26" # endif diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index c4065cdc1afef..58ecd79cf7469 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -505,6 +505,17 @@ def add_version_header(tc): "test_suite_guard": "!defined(_LIBCPP_VERSION) || (_LIBCPP_HAS_FILESYSTEM && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY)", "libcxx_guard": "_LIBCPP_HAS_FILESYSTEM && _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY", }, + { + "name": "__cpp_lib_flat_map", + "values": {"c++23": 202207}, + "headers": ["flat_map"], + }, + { + "name": "__cpp_lib_flat_set", + "values": {"c++23": 202207}, + "headers": ["flat_set"], + "unimplemented": True, + }, { "name": "__cpp_lib_format", "values": { From d578d0bb135ca337b14aabe6696fe5b0a0932932 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 25 Jan 2025 18:30:36 +0000 Subject: [PATCH 100/432] [gn build] Port def50f701f6a --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 90303821eb09f..f118d22c472d8 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1325,8 +1325,11 @@ if (current_toolchain == default_toolchain) { "__filesystem/space_info.h", "__filesystem/u8path.h", "__flat_map/flat_map.h", + "__flat_map/flat_multimap.h", "__flat_map/key_value_iterator.h", + "__flat_map/sorted_equivalent.h", "__flat_map/sorted_unique.h", + "__flat_map/utils.h", "__format/buffer.h", "__format/concepts.h", "__format/container_adaptor.h", From 2655ae54db6d7e9276a5ef4208cbeff1ae2ee72c Mon Sep 17 00:00:00 2001 From: Jakub Kuderski Date: Sat, 25 Jan 2025 13:52:07 -0500 Subject: [PATCH 101/432] [mlir] Fix deprecated pointer union casts in toy example (#124422) --- mlir/examples/toy/Ch4/mlir/Dialect.cpp | 2 +- mlir/examples/toy/Ch5/mlir/Dialect.cpp | 2 +- mlir/examples/toy/Ch6/mlir/Dialect.cpp | 2 +- mlir/examples/toy/Ch7/mlir/Dialect.cpp 
 | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mlir/examples/toy/Ch4/mlir/Dialect.cpp b/mlir/examples/toy/Ch4/mlir/Dialect.cpp
index 6c6cdd934cea8..076a75a26619b 100644
--- a/mlir/examples/toy/Ch4/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch4/mlir/Dialect.cpp
@@ -333,7 +333,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
diff --git a/mlir/examples/toy/Ch5/mlir/Dialect.cpp b/mlir/examples/toy/Ch5/mlir/Dialect.cpp
index 72072f9188bf3..fb7c742a01802 100644
--- a/mlir/examples/toy/Ch5/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch5/mlir/Dialect.cpp
@@ -333,7 +333,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
diff --git a/mlir/examples/toy/Ch6/mlir/Dialect.cpp b/mlir/examples/toy/Ch6/mlir/Dialect.cpp
index 72072f9188bf3..fb7c742a01802 100644
--- a/mlir/examples/toy/Ch6/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch6/mlir/Dialect.cpp
@@ -333,7 +333,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
diff --git a/mlir/examples/toy/Ch7/mlir/Dialect.cpp b/mlir/examples/toy/Ch7/mlir/Dialect.cpp
index 7e030ffc5488c..55c44c45e0f00 100644
--- a/mlir/examples/toy/Ch7/mlir/Dialect.cpp
+++ b/mlir/examples/toy/Ch7/mlir/Dialect.cpp
@@ -367,7 +367,7 @@ CallInterfaceCallable GenericCallOp::getCallableForCallee() {
 /// Set the callee for the generic call operation, this is required by the call
 /// interface.
 void GenericCallOp::setCalleeFromCallable(CallInterfaceCallable callee) {
-  (*this)->setAttr("callee", callee.get<SymbolRefAttr>());
+  (*this)->setAttr("callee", cast<SymbolRefAttr>(callee));
 }
 
 /// Get the argument operands to the called function, this is required by the
From 4bcd8184a093d2d9f0aad1053dbb1367891da6a5 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 25 Jan 2025 10:53:01 -0800
Subject: [PATCH 102/432] [TargetLowering] Pull similar code out of the
 forceExpandWideMUL into a helper. NFC (#124371)

These functions have similar code. One of them calculates the 2x width
full product from 2 sources. The other calculates the product from 2
sources that have low and high halves.

This patch introduces a new function that takes HiLHS and HiRHS as
optional values. If they are not null, they will be used in the
calculation of the Hi half. The Signed flag can only be set when
HiLHS/HiRHS are null.
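As a reader's aid (not part of the patch), the 4-piece expansion can be
sketched over plain 64-bit integers; mulhilo64 is a hypothetical name, and
the structure mirrors the DAG nodes the new helper emits. The signed variant
would extract the high halves with arithmetic right shifts instead.

  #include <cstdint>
  #include <utility>

  // Full 128-bit product of two uint64_t values as a (Hi, Lo) pair, using
  // only 64-bit multiplies (Knuth's Algorithm M / Hacker's Delight scheme).
  std::pair<uint64_t, uint64_t> mulhilo64(uint64_t LHS, uint64_t RHS) {
    const uint64_t Mask = 0xFFFFFFFFu;        // low half-word mask
    uint64_t LL = LHS & Mask, LH = LHS >> 32; // halves of LHS
    uint64_t RL = RHS & Mask, RH = RHS >> 32; // halves of RHS

    uint64_t T = LL * RL;                     // low x low
    uint64_t TL = T & Mask, TH = T >> 32;
    uint64_t U = LH * RL + TH;                // first cross product + carry
    uint64_t UL = U & Mask, UH = U >> 32;
    uint64_t V = LL * RH + UL;                // second cross product + carry
    uint64_t VH = V >> 32;

    uint64_t Lo = TL + (V << 32);             // assemble the low word
    uint64_t Hi = LH * RH + UH + VH;          // assemble the high word
    return {Hi, Lo};
  }

When HiLHS/HiRHS are provided, the helper additionally folds
HiRHS*LHS + RHS*HiLHS into Hi, which is exactly the wrap-around term of a
double-wide multiply.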
---
 llvm/include/llvm/CodeGen/TargetLowering.h    |   9 +
 .../CodeGen/SelectionDAG/TargetLowering.cpp   | 186 ++++++++++--------
 2 files changed, 108 insertions(+), 87 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 861cffdc115a4..4ad2835a70404 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5503,6 +5503,15 @@ class TargetLowering : public TargetLoweringBase {
   bool expandMULO(SDNode *Node, SDValue &Result, SDValue &Overflow,
                   SelectionDAG &DAG) const;
 
+  /// Calculate the product twice the width of LHS and RHS. If HiLHS/HiRHS are
+  /// non-null they will be included in the multiplication. The expansion works
+  /// by splitting the 2 inputs into 4 pieces that we can multiply and add
+  /// together without needing MULH or MUL_LOHI.
+  void forceExpandMultiply(SelectionDAG &DAG, const SDLoc &dl, bool Signed,
+                           SDValue &Lo, SDValue &Hi, SDValue LHS, SDValue RHS,
+                           SDValue HiLHS = SDValue(),
+                           SDValue HiRHS = SDValue()) const;
+
   /// forceExpandWideMUL - Unconditionally expand a MUL into either a libcall or
   /// brute force via a wide multiplication. The expansion works by
   /// attempting to do a multiplication on a wider type twice the size of the
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 0d039860b9f0f..a37ec662ce2d9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -10857,6 +10857,64 @@ SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {
   return DAG.getSelect(dl, VT, Cond, SatVal, Result);
 }
 
+void TargetLowering::forceExpandMultiply(SelectionDAG &DAG, const SDLoc &dl,
+                                         bool Signed, SDValue &Lo, SDValue &Hi,
+                                         SDValue LHS, SDValue RHS,
+                                         SDValue HiLHS, SDValue HiRHS) const {
+  EVT VT = LHS.getValueType();
+  assert(RHS.getValueType() == VT && "Mismatching operand types");
+
+  assert((HiLHS && HiRHS) || (!HiLHS && !HiRHS));
+  assert((!Signed || !HiLHS) &&
+         "Signed flag should only be set when HiLHS and HiRHS are null");
+
+  // We'll expand the multiplication by brute force because we have no other
+  // options. This is a trivially-generalized version of the code from
+  // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+  // 4.3.1). If Signed is set, we can use arithmetic right shifts to propagate
+  // sign bits while calculating the Hi half.
+  unsigned Bits = VT.getSizeInBits();
+  unsigned HalfBits = Bits / 2;
+  SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
+  SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask);
+  SDValue RL = DAG.getNode(ISD::AND, dl, VT, RHS, Mask);
+
+  SDValue T = DAG.getNode(ISD::MUL, dl, VT, LL, RL);
+  SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask);
+
+  SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl);
+  // This is always an unsigned shift.
+  SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift);
+
+  unsigned ShiftOpc = Signed ?
ISD::SRA : ISD::SRL; + SDValue LH = DAG.getNode(ShiftOpc, dl, VT, LHS, Shift); + SDValue RH = DAG.getNode(ShiftOpc, dl, VT, RHS, Shift); + + SDValue U = + DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RL), TH); + SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask); + SDValue UH = DAG.getNode(ShiftOpc, dl, VT, U, Shift); + + SDValue V = + DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LL, RH), UL); + SDValue VH = DAG.getNode(ShiftOpc, dl, VT, V, Shift); + + Lo = DAG.getNode(ISD::ADD, dl, VT, TL, + DAG.getNode(ISD::SHL, dl, VT, V, Shift)); + + Hi = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RH), + DAG.getNode(ISD::ADD, dl, VT, UH, VH)); + + // If HiLHS and HiRHS are set, multiply them by the opposite low part and add + // the products to Hi. + if (HiLHS) { + Hi = DAG.getNode(ISD::ADD, dl, VT, Hi, + DAG.getNode(ISD::ADD, dl, VT, + DAG.getNode(ISD::MUL, dl, VT, HiRHS, LHS), + DAG.getNode(ISD::MUL, dl, VT, RHS, HiLHS))); + } +} + void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl, bool Signed, EVT WideVT, const SDValue LL, const SDValue LH, @@ -10877,45 +10935,7 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl, LC = RTLIB::MUL_I128; if (LC == RTLIB::UNKNOWN_LIBCALL || !getLibcallName(LC)) { - // We'll expand the multiplication by brute force because we have no other - // options. This is a trivially-generalized version of the code from - // Hacker's Delight (itself derived from Knuth's Algorithm M from section - // 4.3.1). - EVT VT = LL.getValueType(); - unsigned Bits = VT.getSizeInBits(); - unsigned HalfBits = Bits >> 1; - SDValue Mask = - DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT); - SDValue LLL = DAG.getNode(ISD::AND, dl, VT, LL, Mask); - SDValue RLL = DAG.getNode(ISD::AND, dl, VT, RL, Mask); - - SDValue T = DAG.getNode(ISD::MUL, dl, VT, LLL, RLL); - SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask); - - SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl); - SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift); - SDValue LLH = DAG.getNode(ISD::SRL, dl, VT, LL, Shift); - SDValue RLH = DAG.getNode(ISD::SRL, dl, VT, RL, Shift); - - SDValue U = DAG.getNode(ISD::ADD, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, LLH, RLL), TH); - SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask); - SDValue UH = DAG.getNode(ISD::SRL, dl, VT, U, Shift); - - SDValue V = DAG.getNode(ISD::ADD, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, LLL, RLH), UL); - SDValue VH = DAG.getNode(ISD::SRL, dl, VT, V, Shift); - - SDValue W = - DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LLH, RLH), - DAG.getNode(ISD::ADD, dl, VT, UH, VH)); - Lo = DAG.getNode(ISD::ADD, dl, VT, TL, - DAG.getNode(ISD::SHL, dl, VT, V, Shift)); - - Hi = DAG.getNode(ISD::ADD, dl, VT, W, - DAG.getNode(ISD::ADD, dl, VT, - DAG.getNode(ISD::MUL, dl, VT, RH, LL), - DAG.getNode(ISD::MUL, dl, VT, RL, LH))); + forceExpandMultiply(DAG, dl, /*Signed=*/false, Lo, Hi, LL, RL, LH, RH); } else { // Attempt a libcall. SDValue Ret; @@ -10965,58 +10985,50 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl, else if (WideVT == MVT::i128) LC = RTLIB::MUL_I128; - if (LC != RTLIB::UNKNOWN_LIBCALL && getLibcallName(LC)) { - SDValue HiLHS, HiRHS; - if (Signed) { - // The high part is obtained by SRA'ing all but one of the bits of low - // part. 
- unsigned LoSize = VT.getFixedSizeInBits(); - SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl); - HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift); - HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift); - } else { - HiLHS = DAG.getConstant(0, dl, VT); - HiRHS = DAG.getConstant(0, dl, VT); - } - forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi); + if (LC == RTLIB::UNKNOWN_LIBCALL || !getLibcallName(LC)) { + forceExpandMultiply(DAG, dl, Signed, Lo, Hi, LHS, RHS); return; } - // Expand the multiplication by brute force. This is a generalized-version of - // the code from Hacker's Delight (itself derived from Knuth's Algorithm M - // from section 4.3.1) combined with the Hacker's delight code - // for calculating mulhs. - unsigned Bits = VT.getSizeInBits(); - unsigned HalfBits = Bits / 2; - SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT); - SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask); - SDValue RL = DAG.getNode(ISD::AND, dl, VT, RHS, Mask); - - SDValue T = DAG.getNode(ISD::MUL, dl, VT, LL, RL); - SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask); - - SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl); - // This is always an unsigned shift. - SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift); - - unsigned ShiftOpc = Signed ? ISD::SRA : ISD::SRL; - SDValue LH = DAG.getNode(ShiftOpc, dl, VT, LHS, Shift); - SDValue RH = DAG.getNode(ShiftOpc, dl, VT, RHS, Shift); - - SDValue U = - DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RL), TH); - SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask); - SDValue UH = DAG.getNode(ShiftOpc, dl, VT, U, Shift); - - SDValue V = - DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LL, RH), UL); - SDValue VH = DAG.getNode(ShiftOpc, dl, VT, V, Shift); - - Lo = DAG.getNode(ISD::ADD, dl, VT, TL, - DAG.getNode(ISD::SHL, dl, VT, V, Shift)); + SDValue HiLHS, HiRHS; + if (Signed) { + // The high part is obtained by SRA'ing all but one of the bits of low + // part. + unsigned LoSize = VT.getFixedSizeInBits(); + SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl); + HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift); + HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift); + } else { + HiLHS = DAG.getConstant(0, dl, VT); + HiRHS = DAG.getConstant(0, dl, VT); + } - Hi = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RH), - DAG.getNode(ISD::ADD, dl, VT, UH, VH)); + // Attempt a libcall. + SDValue Ret; + TargetLowering::MakeLibCallOptions CallOptions; + CallOptions.setIsSigned(Signed); + CallOptions.setIsPostTypeLegalization(true); + if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) { + // Halves of WideVT are packed into registers in different order + // depending on platform endianness. This is usually handled by + // the C calling convention, but we can't defer to it in + // the legalizer. + SDValue Args[] = {LHS, HiLHS, RHS, HiRHS}; + Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; + } else { + SDValue Args[] = {HiLHS, LHS, HiRHS, RHS}; + Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first; + } + assert(Ret.getOpcode() == ISD::MERGE_VALUES && + "Ret value is a collection of constituent nodes holding result."); + if (DAG.getDataLayout().isLittleEndian()) { + // Same as above. 
+    Lo = Ret.getOperand(0);
+    Hi = Ret.getOperand(1);
+  } else {
+    Lo = Ret.getOperand(1);
+    Hi = Ret.getOperand(0);
+  }
 }
 
 SDValue
From 5e65f430414dd9df79ca6a1056b4943110ebc14b Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Sat, 25 Jan 2025 11:48:51 -0800
Subject: [PATCH 103/432] [SLP][NFC] Add a test, producing a series of
 extractelements, building a non-extendable tree

---
 .../X86/extracts-non-extendable.ll            | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
new file mode 100644
index 0000000000000..d87c40511fcf7
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-grtev4-linux-gnu -mattr="+aes,+avx,+cmov,+crc32,+cx16,+cx8,+fxsr,+mmx,+pclmul,+popcnt,+prfchw,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" < %s | FileCheck %s
+
+define void @test(i64 %v) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[BB:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> <i64 0, i64 poison>, i64 [[V]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 0, [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[TMP3]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP7:%.*]] = and i1 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP9:%.*]] = and i1 [[TMP7]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = and i1 [[TMP9]], false
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 0, [[TMP2]]
+; CHECK-NEXT:    [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 0, 0
+; CHECK-NEXT:    [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp ult i64 0, 0
+; CHECK-NEXT:    [[TMP20:%.*]] = select i1 [[TMP19]], i1 [[TMP18]], i1 false
+; CHECK-NEXT:    br i1 [[TMP20]], label %[[BB_I107_PREHEADER:.*]], label %[[BB_I27_I_PREHEADER:.*]]
+; CHECK:       [[BB_I107_PREHEADER]]:
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    [[DOTSROA_1278_10_EXTRACT_SHIFT83_I1622_1:%.*]] = xor i64 0, [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = xor <2 x i64> zeroinitializer, [[TMP1]]
+; CHECK-NEXT:    [[TMP23:%.*]] = or <2 x i64> [[TMP22]], zeroinitializer
+; CHECK-NEXT:    [[TMP24:%.*]] = or <2 x i64> splat (i64 1), [[TMP23]]
+; CHECK-NEXT:    [[TMP25:%.*]] = and <2 x i64> [[TMP24]], zeroinitializer
+; CHECK-NEXT:    [[TMP26:%.*]] = icmp eq <2 x i64> [[TMP25]], zeroinitializer
+; CHECK-NEXT:    ret void
+; CHECK:       [[BB_I27_I_PREHEADER]]:
+; CHECK-NEXT:    unreachable
+;
+bb:
+  %.sroa.82529.14.insert.insert = or i64 0, 0
+  %.sroa.02528.sroa.0.0.insert.insert = or i64 %v, 0
+  %0 = icmp eq i64 0, %.sroa.02528.sroa.0.0.insert.insert
+  %1 = icmp eq i64 0, 0
+  %2 = and i1 %0, %1
+  %3 = icmp eq i64 0, 0
+  %4 = and i1 %2, %3
+  %5 = icmp eq i64 0, 0
+  %6 = and i1 %4, %5
+  %7 = and
i1 %6, false + %8 = icmp eq i64 0, %.sroa.02528.sroa.0.0.insert.insert + %9 = and i1 %7, %8 + %10 = icmp eq i64 0, 0 + %11 = and i1 %9, %10 + %12 = icmp eq i64 0, 0 + %13 = and i1 %11, %12 + %14 = icmp eq i64 0, 0 + %15 = and i1 %13, %14 + %16 = icmp ult i64 0, 0 + %17 = select i1 %16, i1 %15, i1 false + br i1 %17, label %bb.i107.preheader, label %bb.i27.i.preheader + +bb.i107.preheader: ; preds = %bb + %.sroa.1278.10.extract.shift83.i1622.1 = xor i64 0, %.sroa.82529.14.insert.insert + %.sroa.076.2.extract.shift80.i1619.4 = xor i64 0, %.sroa.02528.sroa.0.0.insert.insert + %.sroa.071.2.extract.shift86.i1625.4 = or i64 %.sroa.076.2.extract.shift80.i1619.4, 0 + %.sroa.1278.10.extract.shift83.i1622.7 = xor i64 0, %.sroa.82529.14.insert.insert + %.sroa.12.10.extract.shift89.i1634.7 = or i64 %.sroa.1278.10.extract.shift83.i1622.7, 0 + %.sroa.02756.2.extract.shift6530 = or i64 %.sroa.071.2.extract.shift86.i1625.4, 1 + %18 = and i64 %.sroa.02756.2.extract.shift6530, 0 + %19 = icmp eq i64 %18, 0 + %20 = or i64 1, %.sroa.12.10.extract.shift89.i1634.7 + %21 = and i64 %20, 0 + %22 = icmp eq i64 %21, 0 + ret void + +bb.i27.i.preheader: ; preds = %bb + unreachable +} + From e5b0132d157ad4c9a502dc8c4a61a3a3c83646c2 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Sat, 25 Jan 2025 21:15:10 +0000 Subject: [PATCH 104/432] SCEV: add samesign tests for exit-limit computation (#124304) As the tests demonstrate, computeExitLimitFromICmp needs no additional changes to compute exit limits from an icmp with samesign. --- .../ScalarEvolution/exit-count-non-strict.ll | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) diff --git a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll index f7a18c77a82c8..1e15d2d0d6461 100644 --- a/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll +++ b/llvm/test/Analysis/ScalarEvolution/exit-count-non-strict.ll @@ -30,6 +30,35 @@ exit: ret void } +define void @le_from_zero(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_zero' +; CHECK-NEXT: Determining loop execution counts for: @le_from_zero +; CHECK-NEXT: Loop %loop: backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: exit count for latch: %N +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: symbolic max exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: symbolic max exit count for latch: %N +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @ule_from_one(i32 %M, i32 %N) { ; CHECK-LABEL: 'ule_from_one' ; CHECK-NEXT: Determining loop execution counts for: @ule_from_one @@ -59,6 +88,35 @@ exit: ret void } +define void @le_from_one(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_one' +; CHECK-NEXT: Determining loop execution counts for: @le_from_one +; CHECK-NEXT: Loop %loop: backedge-taken count is (%M umin_seq (-1 + %N)) +; CHECK-NEXT: exit count for loop: %M +; CHECK-NEXT: exit count for latch: (-1 + %N) +; CHECK-NEXT: Loop %loop: constant max 
backedge-taken count is i32 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (%M umin_seq (-1 + %N)) +; CHECK-NEXT: symbolic max exit count for loop: %M +; CHECK-NEXT: symbolic max exit count for latch: (-1 + %N) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 1, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @ule_from_unknown(i32 %M, i32 %N, i32 %S) { ; CHECK-LABEL: 'ule_from_unknown' ; CHECK-NEXT: Determining loop execution counts for: @ule_from_unknown @@ -133,6 +191,51 @@ exit: ret void } +define void @le_from_zero_no_nuw(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_zero_no_nuw' +; CHECK-NEXT: Determining loop execution counts for: @le_from_zero_no_nuw +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: exit count for latch: %N +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is %N +; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated symbolic max exit count for loop: (1 + (zext i32 %M to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: symbolic max exit count for latch: %N +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((zext i32 %N to i64) umin (1 + (zext i32 %M to i64))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {0,+,1}<%loop> Added Flags: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_int_min(i32 %M, i32 %N) { ; CHECK-LABEL: 'sle_from_int_min' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min @@ -162,6 +265,35 @@ exit: ret void } +define void @le_from_int_min(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_int_min' +; CHECK-NEXT: Determining loop execution counts for: @le_from_int_min +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-2147483647 + (2147483647 umax %M)) umin_seq (-2147483648 + %N)) +; CHECK-NEXT: exit count for loop: (-2147483647 + (2147483647 umax %M)) +; CHECK-NEXT: exit count for latch: (-2147483648 + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -2147483648 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-2147483647 + (2147483647 umax %M)) umin_seq (-2147483648 + %N)) +; CHECK-NEXT: symbolic max exit count for loop: (-2147483647 + (2147483647 umax %M)) +; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N) +; 
CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ u0x80000000, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_int_min_plus_one(i32 %M, i32 %N) { ; CHECK-LABEL: 'sle_from_int_min_plus_one' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min_plus_one @@ -191,6 +323,35 @@ exit: ret void } +define void @le_from_int_min_plus_one(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_int_min_plus_one' +; CHECK-NEXT: Determining loop execution counts for: @le_from_int_min_plus_one +; CHECK-NEXT: Loop %loop: backedge-taken count is ((-2147483648 + (-2147483648 umax %M)) umin_seq (2147483647 + %N)) +; CHECK-NEXT: exit count for loop: (-2147483648 + (-2147483648 umax %M)) +; CHECK-NEXT: exit count for latch: (2147483647 + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 2147483647 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is ((-2147483648 + (-2147483648 umax %M)) umin_seq (2147483647 + %N)) +; CHECK-NEXT: symbolic max exit count for loop: (-2147483648 + (-2147483648 umax %M)) +; CHECK-NEXT: symbolic max exit count for latch: (2147483647 + %N) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ u0x80000001, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_unknown(i32 %M, i32 %N, i32 %S) { ; CHECK-LABEL: 'sle_from_unknown' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_unknown @@ -220,6 +381,35 @@ exit: ret void } +define void @le_from_unknown(i32 %M, i32 %N, i32 %S) { +; CHECK-LABEL: 'le_from_unknown' +; CHECK-NEXT: Determining loop execution counts for: @le_from_unknown +; CHECK-NEXT: Loop %loop: backedge-taken count is (((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 ((-1 * %S) + %N) to i64)) +; CHECK-NEXT: exit count for loop: ((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: exit count for latch: ((-1 * %S) + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i64 4294967295 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 ((-1 * %S) + %N) to i64)) +; CHECK-NEXT: symbolic max exit count for loop: ((-1 * (zext i32 %S to i64)) + ((zext i32 %S to i64) umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: symbolic max exit count for latch: ((-1 * %S) + %N) +; CHECK-NEXT: Loop %loop: Trip multiple is 1 +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %S, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} + define void @sle_from_int_min_no_nsw(i32 %M, i32 %N) { ; CHECK-LABEL: 'sle_from_int_min_no_nsw' ; CHECK-NEXT: Determining loop execution counts for: @sle_from_int_min_no_nsw @@ -264,3 +454,48 @@ latch: exit: 
ret void } + +define void @le_from_int_min_no_nuw_nsw(i32 %M, i32 %N) { +; CHECK-LABEL: 'le_from_int_min_no_nuw_nsw' +; CHECK-NEXT: Determining loop execution counts for: @le_from_int_min_no_nuw_nsw +; CHECK-NEXT: Loop %loop: Unpredictable backedge-taken count. +; CHECK-NEXT: exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated exit count for loop: (-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: exit count for latch: (-2147483648 + %N) +; CHECK-NEXT: Loop %loop: constant max backedge-taken count is i32 -1 +; CHECK-NEXT: Loop %loop: symbolic max backedge-taken count is (-2147483648 + %N) +; CHECK-NEXT: symbolic max exit count for loop: ***COULDNOTCOMPUTE*** +; CHECK-NEXT: predicated symbolic max exit count for loop: (-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-EMPTY: +; CHECK-NEXT: symbolic max exit count for latch: (-2147483648 + %N) +; CHECK-NEXT: Loop %loop: Predicated backedge-taken count is ((-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 (-2147483648 + %N) to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated constant max backedge-taken count is i64 2147483648 +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; CHECK-NEXT: Loop %loop: Predicated symbolic max backedge-taken count is ((-2147483648 + (2147483648 umax (1 + (zext i32 %M to i64)))) umin_seq (zext i32 (-2147483648 + %N) to i64)) +; CHECK-NEXT: Predicates: +; CHECK-NEXT: {-2147483648,+,1}<%loop> Added Flags: +; +entry: + br label %loop + +loop: + %iv = phi i32 [ u0x80000000, %entry ], [ %iv.next, %latch ] + %cmp1 = icmp samesign ule i32 %iv, %M + br i1 %cmp1, label %latch, label %exit + +latch: + %iv.next = add i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret void +} From 89f2fee9f80658650524ba4fc12f01409e45000e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 13:17:15 -0800 Subject: [PATCH 105/432] [InstCombine] Add test for incorrect retention of Range attribute in fshl --- llvm/test/Transforms/InstCombine/fsh.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll index 236c69e7a5bcb..34648d586300f 100644 --- a/llvm/test/Transforms/InstCombine/fsh.ll +++ b/llvm/test/Transforms/InstCombine/fsh.ll @@ -1068,3 +1068,18 @@ entry: %res = call <2 x i31> @llvm.fshl.v2i31(<2 x i31> %x, <2 x i31> zeroinitializer, <2 x i31> %y) ret <2 x i31> %res } + +define i8 @fshl_range_trunc(i1 %x) { +; CHECK-LABEL: @fshl_range_trunc( +; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[X:%.*]] to i32 +; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[ZEXT]], 126 +; CHECK-NEXT: [[FSHL:%.*]] = call range(i32 -4, 2) i32 @llvm.fshl.i32(i32 [[OR]], i32 -2, i32 1) +; CHECK-NEXT: [[TR:%.*]] = trunc nuw i32 [[FSHL]] to i8 +; CHECK-NEXT: ret i8 [[TR]] +; + %zext = zext i1 %x to i32 + %or = or disjoint i32 %zext, -2 + %fshl = call range(i32 -4, 2) i32 @llvm.fshl.i32(i32 %or, i32 %or, i32 1) + %tr = trunc nsw i32 %fshl to i8 + ret i8 %tr +} From 77c325b646301e394bcd89c2980b4c2da8af49cd Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sat, 25 Jan 2025 22:20:34 +0100 Subject: [PATCH 106/432] [LLD][COFF] Keep hasData true in 
NullChunk constructor (#124368)

`NullChunk` instances do write data, even if it's always zero. Setting
`hasData` to false causes `Writer::assignAddresses` to ignore them when
calculating `rawSize`. This typically isn't an issue, as null chunks are
usually positioned within a section, and later chunks adjust the size
accordingly. However, on ARM64EC, the auxiliary IAT is placed at the end
of the `.rdata` section and terminates with a null chunk. As a result,
`rawSize` is never updated to account for it, and space for the null
chunk is not allocated. Consequently, when `NullChunk::writeTo` is
called, it receives an invalid pointer - either pointing to the next
section or beyond the allocated buffer.
---
 lld/COFF/DLL.cpp                  |  1 -
 lld/test/COFF/arm64ec-import.test | 13 +++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp
index 6a3f8eb21e847..ae3a8047b7008 100644
--- a/lld/COFF/DLL.cpp
+++ b/lld/COFF/DLL.cpp
@@ -132,7 +132,6 @@ class ImportDirectoryChunk : public NonSectionChunk {
 class NullChunk : public NonSectionChunk {
 public:
   explicit NullChunk(size_t n, uint32_t align) : size(n) {
-    hasData = false;
     setAlignment(align);
   }
   explicit NullChunk(COFFLinkerContext &ctx)
diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test
index 033c27884be02..bb2b772081d59 100644
--- a/lld/test/COFF/arm64ec-import.test
+++ b/lld/test/COFF/arm64ec-import.test
@@ -160,6 +160,19 @@ BASERELOC-NEXT: Type: DIR64
 BASERELOC-NEXT: Address: 0x5020
 BASERELOC-NEXT: }
+
+Build with -filealign:8 to enable precise size checking.
+
+RUN: lld-link -machine:arm64ec -dll -noentry -out:out-size.dll loadconfig-arm64ec.obj icall.obj hybmp.obj \
+RUN:     test.obj test-arm64ec.lib test2-arm64ec.lib -filealign:8
+
+RUN: llvm-readobj --headers out-size.dll | FileCheck --check-prefix=RDATA-HEADER %s
+
+RDATA-HEADER:      Name: .rdata (2E 72 64 61 74 61 00 00)
+RDATA-HEADER-NEXT: VirtualSize: 0x2030
+RDATA-HEADER-NEXT: VirtualAddress: 0x3000
+RDATA-HEADER-NEXT: RawDataSize: 8240
+
 #--- test.s
 .section .test, "r"
 .globl arm64ec_data_sym

From 2131115be5b9d8b39af80973d9b64c0adc41d38d Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 25 Jan 2025 13:35:11 -0800
Subject: [PATCH 107/432] [InstCombine] Drop Range attribute when simplifying
 'fshl' based on demanded bits (#124429)

When simplifying operands based on demanded bits, the return value
range of llvm.fshl might change. Keeping the Range attribute might
cause llvm.fshl to generate a poison value and lead to a miscompile.
Drop the Range attribute, similar to `dropPoisonGeneratingFlags`
elsewhere.
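As a worked illustration (a hypothetical, self-contained C++ sketch; the
fshl semantics are modeled by hand and the constants come from the
fshl_range_trunc test updated below):

  #include <cstdint>
  #include <cstdio>

  // Models i32 @llvm.fshl.i32(a, b, 1): concat(a, b) shifted left by
  // one bit, keeping the top 32 bits.
  static uint32_t fshl32_by1(uint32_t a, uint32_t b) {
    return (a << 1) | (b >> 31);
  }

  int main() {
    // Before simplification %or = zext(i1 %x) | -2 is -2 or -1, so
    // fshl(%or, %or, 1) is -3 or -1: inside the annotated range [-4, 2).
    printf("%d\n", (int32_t)fshl32_by1(0xFFFFFFFEu, 0xFFFFFFFEu)); // -3
    // After the demanded-bits rewrite the operands become (zext | 126)
    // and the constant -2. The demanded low 8 bits are unchanged (0xFD),
    // but the full i32 result is now 253, outside [-4, 2); a retained
    // range(i32 -4, 2) would therefore make the call return poison.
    printf("%d\n", (int32_t)fshl32_by1(126u, 0xFFFFFFFEu)); // 253
  }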
Fix #124387

---
 .../InstCombine/InstCombineSimplifyDemanded.cpp | 9 ++++++---
 llvm/test/Transforms/InstCombine/fsh.ll         | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 943598a30f040..2c8939b5a0514 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1039,11 +1039,14 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I,
       APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
       APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
       if (I->getOperand(0) != I->getOperand(1)) {
-        if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown,
-                                 Depth + 1, Q) ||
+        if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1,
+                                 Q) ||
             SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1,
-                                 Q))
+                                 Q)) {
+          // Range attribute may no longer hold.
+          I->dropPoisonGeneratingReturnAttributes();
           return I;
+        }
       } else { // fshl is a rotate
         // Avoid converting rotate into funnel shift.
         // Only simplify if one operand is constant.
diff --git a/llvm/test/Transforms/InstCombine/fsh.ll b/llvm/test/Transforms/InstCombine/fsh.ll
index 34648d586300f..3ff4f9a2abf33 100644
--- a/llvm/test/Transforms/InstCombine/fsh.ll
+++ b/llvm/test/Transforms/InstCombine/fsh.ll
@@ -1069,11 +1069,12 @@ entry:
   ret <2 x i31> %res
 }

+;; Issue #124387: Range attribute no longer holds after operands changed.
 define i8 @fshl_range_trunc(i1 %x) {
 ; CHECK-LABEL: @fshl_range_trunc(
 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[X:%.*]] to i32
 ; CHECK-NEXT: [[OR:%.*]] = or disjoint i32 [[ZEXT]], 126
-; CHECK-NEXT: [[FSHL:%.*]] = call range(i32 -4, 2) i32 @llvm.fshl.i32(i32 [[OR]], i32 -2, i32 1)
+; CHECK-NEXT: [[FSHL:%.*]] = call i32 @llvm.fshl.i32(i32 [[OR]], i32 -2, i32 1)
 ; CHECK-NEXT: [[TR:%.*]] = trunc nuw i32 [[FSHL]] to i8
 ; CHECK-NEXT: ret i8 [[TR]]
 ;

From 1395cd015f2edf26f8c2567870183d63f4fdd753 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sat, 25 Jan 2025 21:55:15 +0000
Subject: [PATCH 108/432] [VPlan] Support multi-exit loops in HCFG builder.

Update HCFG construction to support multi-exit loops. If there is no
unique exit block, map the middle block of the initial plan to the exit
block from the latch.

This further unifies HCFG construction and prepares for using it to
also build an initial VPlan (VPlan0) for inner loops.

Effectively NFC as this isn't used on the default code path yet.
---
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp |  60 +++++++---
 .../Transforms/Vectorize/VPlanHCFGTest.cpp    | 109 ++++++++++++++++++
 2 files changed, 155 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 32723e5db9c45..5a2e5d7cfee48 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -350,10 +350,25 @@ void PlainCFGBuilder::buildPlainCFG() {
   // new vector preheader); here we're interested in setting BB2VPBB to the
   // latter.
BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB; - BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock(); Loop2Region[LI->getLoopFor(TheLoop->getHeader())] = TheRegion; - assert(LoopExitBB && "Loops with multiple exits are not supported."); - BB2VPBB[LoopExitBB] = cast(TheRegion->getSingleSuccessor()); + BasicBlock *ExitBB = TheLoop->getUniqueExitBlock(); + if (!ExitBB) { + // If there is no unique exit block, we must exit via the latch. This exit + // is mapped to the middle block in the input plan. + BasicBlock *Latch = TheLoop->getLoopLatch(); + auto *Br = cast(Latch->getTerminator()); + if (TheLoop->contains(Br->getSuccessor(0))) { + assert(!TheLoop->contains(Br->getSuccessor(1)) && + "latch must exit the loop"); + ExitBB = Br->getSuccessor(1); + } else { + assert(!TheLoop->contains(Br->getSuccessor(0)) && + "latch must exit the loop"); + ExitBB = Br->getSuccessor(0); + } + } + assert(ExitBB && "Must have a unique exit block or also exit via the latch."); + BB2VPBB[ExitBB] = cast(TheRegion->getSingleSuccessor()); // The existing vector region's entry and exiting VPBBs correspond to the loop // header and latch. @@ -423,21 +438,38 @@ void PlainCFGBuilder::buildPlainCFG() { // representing the condition bit in VPlan (which may be in another VPBB). assert(IRDef2VPValue.contains(BI->getCondition()) && "Missing condition bit in IRDef2VPValue!"); - VPBasicBlock *Successor0 = getOrCreateVPBB(BI->getSuccessor(0)); - VPBasicBlock *Successor1 = getOrCreateVPBB(BI->getSuccessor(1)); - if (!LoopForBB || BB != LoopForBB->getLoopLatch()) { - VPBB->setTwoSuccessors(Successor0, Successor1); - continue; - } - // For a latch we need to set the successor of the region rather than that - // of VPBB and it should be set to the exit, i.e., non-header successor, - // except for the top region, whose successor was set when creating VPlan's - // skeleton. - if (TheRegion != Region) { + + BasicBlock *IRSucc0 = BI->getSuccessor(0); + BasicBlock *IRSucc1 = BI->getSuccessor(1); + VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0); + VPBasicBlock *Successor1 = getOrCreateVPBB(IRSucc1); + if (BB == LoopForBB->getLoopLatch()) { + // For a latch we need to set the successor of the region rather than that + // of VPBB and it should be set to the exit, i.e., non-header successor, + // except for the top region, whose successor was set when creating + // VPlan's skeleton. + assert(TheRegion != Region && + "Latch of the top region should have been handled earlier"); Region->setOneSuccessor(isHeaderVPBB(Successor0) ? Successor1 : Successor0); Region->setExiting(VPBB); + continue; } + + // Don't connect any blocks outside the current loop except the latch for + // now. The latch is handled above. + if (LoopForBB) { + if (!LoopForBB->contains(IRSucc0)) { + VPBB->setOneSuccessor(Successor1); + continue; + } + if (!LoopForBB->contains(IRSucc1)) { + VPBB->setOneSuccessor(Successor0); + continue; + } + } + + VPBB->setTwoSuccessors(Successor0, Successor1); } // 2. 
The whole CFG has been built at this point so all the input Values must diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index dcdaf008e10fe..d787a6c977194 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -234,5 +234,114 @@ TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) { EXPECT_EQ(VecBB->end(), Iter); } +TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoopMultiExit) { + const char *ModuleString = + "define void @f(ptr %A, i64 %N) {\n" + "entry:\n" + " br label %loop.header\n" + "loop.header:\n" + " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]\n" + " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\n" + " %l1 = load i32, ptr %arr.idx, align 4\n" + " %c = icmp eq i32 %l1, 0\n" + " br i1 %c, label %exit.1, label %loop.latch\n" + "loop.latch:\n" + " %res = add i32 %l1, 10\n" + " store i32 %res, ptr %arr.idx, align 4\n" + " %iv.next = add i64 %iv, 1\n" + " %exitcond = icmp ne i64 %iv.next, %N\n" + " br i1 %exitcond, label %loop.header, label %exit.2\n" + "exit.1:\n" + " ret void\n" + "exit.2:\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("f"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + // Add an external value to check we do not print the list of external values, + // as this is not required with the new printing. + Plan->getOrAddLiveIn(&*F->arg_begin()); + std::string FullDump; + raw_string_ostream OS(FullDump); + Plan->printDOT(OS); + const char *ExpectedStr = R"(digraph VPlan { +graph [labelloc=t, fontsize=30; label="Vectorization Plan\n for UF\>=1\nLive-in vp\<%0\> = vector-trip-count\nLive-in ir\<%N\> = original trip-count\n"] +node [shape=rect, fontname=Courier, fontsize=30] +edge [fontname=Courier, fontsize=30] +compound=true + N0 [label = + "ir-bb\:\l" + + "Successor(s): vector.ph\l" + ] + N0 -> N1 [ label=""] + N1 [label = + "vector.ph:\l" + + "Successor(s): vector loop\l" + ] + N1 -> N2 [ label="" lhead=cluster_N3] + subgraph cluster_N3 { + fontname=Courier + label="\ vector loop" + N2 [label = + "vector.body:\l" + + " WIDEN-PHI ir\<%iv\> = phi ir\<0\>, ir\<%iv.next\>\l" + + " EMIT ir\<%arr.idx\> = getelementptr ir\<%A\>, ir\<%iv\>\l" + + " EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" + + " EMIT ir\<%c\> = icmp ir\<%l1\>, ir\<0\>\l" + + "Successor(s): loop.latch\l" + ] + N2 -> N4 [ label=""] + N4 [label = + "loop.latch:\l" + + " EMIT ir\<%res\> = add ir\<%l1\>, ir\<10\>\l" + + " EMIT store ir\<%res\>, ir\<%arr.idx\>\l" + + " EMIT ir\<%iv.next\> = add ir\<%iv\>, ir\<1\>\l" + + " EMIT ir\<%exitcond\> = icmp ir\<%iv.next\>, ir\<%N\>\l" + + "Successor(s): vector.latch\l" + ] + N4 -> N5 [ label=""] + N5 [label = + "vector.latch:\l" + + "No successors\l" + ] + } + N5 -> N6 [ label="" ltail=cluster_N3] + N6 [label = + "middle.block:\l" + + " EMIT vp\<%cmp.n\> = icmp eq ir\<%N\>, vp\<%0\>\l" + + " EMIT branch-on-cond vp\<%cmp.n\>\l" + + "Successor(s): ir-bb\, scalar.ph\l" + ] + N6 -> N7 [ label="T"] + N6 -> N8 [ label="F"] + N7 [label = + "ir-bb\:\l" + + "No successors\l" + ] + N8 [label = + "scalar.ph:\l" + + "Successor(s): ir-bb\\l" + ] + N8 -> N9 [ label=""] + N9 [label = + "ir-bb\:\l" + + " IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]\l" + + " IR %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\l" + + " IR %l1 = load i32, ptr 
%arr.idx, align 4\l" + + " IR %c = icmp eq i32 %l1, 0\l" + + "No successors\l" + ] +} +)"; + EXPECT_EQ(ExpectedStr, FullDump); +#endif +} + } // namespace } // namespace llvm From 563c7c5539f05e7f8cbb42565c1f24466019f38b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 14:05:01 -0800 Subject: [PATCH 109/432] [clang] Migrate away from PointerUnion::dyn_cast (NFC) (#124425) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast This patch migrates uses of PointerUnion::dyn_cast to dyn_cast_if_present (see the definition of PointerUnion::dyn_cast). Note that we cannot use dyn_cast in any of the migrations in this patch; placing assert(!X.isNull()); just before any of dyn_cast_if_present in this patch triggers some failure in check-clang. --- clang/include/clang/AST/APValue.h | 5 +- clang/include/clang/AST/ASTContext.h | 2 +- clang/include/clang/AST/Decl.h | 2 +- clang/include/clang/AST/DeclBase.h | 2 +- clang/include/clang/AST/DeclTemplate.h | 9 ++- clang/include/clang/AST/Expr.h | 4 +- clang/include/clang/AST/ExprCXX.h | 6 +- clang/include/clang/Basic/IdentifierTable.h | 2 +- clang/include/clang/Lex/Preprocessor.h | 10 +-- clang/lib/APINotes/APINotesManager.cpp | 4 +- clang/lib/AST/Decl.cpp | 71 ++++++++++--------- clang/lib/AST/DeclCXX.cpp | 10 +-- clang/lib/AST/DeclTemplate.cpp | 4 +- clang/lib/AST/TemplateName.cpp | 11 +-- .../Frontend/SerializedDiagnosticPrinter.cpp | 2 +- clang/lib/Sema/SemaDecl.cpp | 6 +- clang/tools/libclang/CIndexDiagnostic.cpp | 3 +- 17 files changed, 83 insertions(+), 70 deletions(-) diff --git a/clang/include/clang/AST/APValue.h b/clang/include/clang/AST/APValue.h index 833a78c77871d..9999a30c51ade 100644 --- a/clang/include/clang/AST/APValue.h +++ b/clang/include/clang/AST/APValue.h @@ -161,8 +161,9 @@ class APValue { template T get() const { return cast(Ptr); } - template - T dyn_cast() const { return Ptr.dyn_cast(); } + template T dyn_cast() const { + return dyn_cast_if_present(Ptr); + } void *getOpaqueValue() const; diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 4e9b961688d55..65be782c1ba43 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -769,7 +769,7 @@ class ASTContext : public RefCountedBase { /// pool. DeclListNode *AllocateDeclListNode(clang::NamedDecl *ND) { if (DeclListNode *Alloc = ListNodeFreeList) { - ListNodeFreeList = Alloc->Rest.dyn_cast(); + ListNodeFreeList = dyn_cast_if_present(Alloc->Rest); Alloc->D = ND; Alloc->Rest = nullptr; return Alloc; diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index d01681483a918..16403774e72b3 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -4035,7 +4035,7 @@ class EnumDecl : public TagDecl { /// Return the type source info for the underlying integer type, /// if no type source info exists, return 0. 
TypeSourceInfo *getIntegerTypeSourceInfo() const { - return IntegerType.dyn_cast(); + return dyn_cast_if_present(IntegerType); } /// Retrieve the source range that covers the underlying type if diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 573b46a2321c5..2c0c3a8dc2f9d 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -1391,7 +1391,7 @@ class DeclContextLookupResult { const_iterator end() const { return iterator(); } bool empty() const { return Result.isNull(); } - bool isSingleResult() const { return Result.dyn_cast(); } + bool isSingleResult() const { return isa_and_present(Result); } reference front() const { return *begin(); } // Find the first declaration of the given type in the list. Note that this diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 8c2da97c07a3b..caaa47d0a297c 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -2009,7 +2009,8 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// Retrieve the template argument list as written in the sources, /// if any. const ASTTemplateArgumentListInfo *getTemplateArgsAsWritten() const { - if (auto *Info = ExplicitInfo.dyn_cast()) + if (auto *Info = + dyn_cast_if_present(ExplicitInfo)) return Info->TemplateArgsAsWritten; return cast(ExplicitInfo); } @@ -2041,7 +2042,8 @@ class ClassTemplateSpecializationDecl : public CXXRecordDecl, /// Gets the location of the template keyword, if present. SourceLocation getTemplateKeywordLoc() const { - if (auto *Info = ExplicitInfo.dyn_cast()) + if (auto *Info = + dyn_cast_if_present(ExplicitInfo)) return Info->TemplateKeywordLoc; return SourceLocation(); } @@ -2786,7 +2788,8 @@ class VarTemplateSpecializationDecl : public VarDecl, /// Set the template argument list as written in the sources. void setTemplateArgsAsWritten(const ASTTemplateArgumentListInfo *ArgsWritten) { - if (auto *Info = ExplicitInfo.dyn_cast()) + if (auto *Info = + dyn_cast_if_present(ExplicitInfo)) Info->TemplateArgsAsWritten = ArgsWritten; else ExplicitInfo = ArgsWritten; diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 708c8656decbe..7be4022649329 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -5180,7 +5180,7 @@ class InitListExpr : public Expr { /// than there are initializers in the list, specifies an expression to be /// used for value initialization of the rest of the elements. Expr *getArrayFiller() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const Expr *getArrayFiller() const { return const_cast(this)->getArrayFiller(); @@ -5205,7 +5205,7 @@ class InitListExpr : public Expr { /// union. However, a designated initializer can specify the /// initialization of a different field within the union. 
FieldDecl *getInitializedFieldInUnion() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const FieldDecl *getInitializedFieldInUnion() const { return const_cast(this)->getInitializedFieldInUnion(); diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 4cec89c979f77..aa10945addf78 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -5026,11 +5026,11 @@ class CXXParenListInitExpr final void setArrayFiller(Expr *E) { ArrayFillerOrUnionFieldInit = E; } Expr *getArrayFiller() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const Expr *getArrayFiller() const { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } void setInitializedFieldInUnion(FieldDecl *FD) { @@ -5038,7 +5038,7 @@ class CXXParenListInitExpr final } FieldDecl *getInitializedFieldInUnion() { - return ArrayFillerOrUnionFieldInit.dyn_cast(); + return dyn_cast_if_present(ArrayFillerOrUnionFieldInit); } const FieldDecl *getInitializedFieldInUnion() const { diff --git a/clang/include/clang/Basic/IdentifierTable.h b/clang/include/clang/Basic/IdentifierTable.h index 33d1cdb46f108..e5e6be3c96600 100644 --- a/clang/include/clang/Basic/IdentifierTable.h +++ b/clang/include/clang/Basic/IdentifierTable.h @@ -1008,7 +1008,7 @@ class Selector { } const IdentifierInfo *getAsIdentifierInfo() const { - return InfoPtr.getPointer().dyn_cast(); + return dyn_cast_if_present(InfoPtr.getPointer()); } MultiKeywordSelector *getMultiKeywordSelector() const { diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 8ddc5b56eedbd..416f403c29841 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -856,7 +856,7 @@ class Preprocessor { !PP.CurSubmoduleState->VisibleModules.getGeneration()) return nullptr; - auto *Info = State.dyn_cast(); + auto *Info = dyn_cast_if_present(State); if (!Info) { Info = new (PP.getPreprocessorAllocator()) ModuleMacroInfo(cast(State)); @@ -885,18 +885,18 @@ class Preprocessor { } ~MacroState() { - if (auto *Info = State.dyn_cast()) + if (auto *Info = dyn_cast_if_present(State)) Info->~ModuleMacroInfo(); } MacroDirective *getLatest() const { - if (auto *Info = State.dyn_cast()) + if (auto *Info = dyn_cast_if_present(State)) return Info->MD; return cast(State); } void setLatest(MacroDirective *MD) { - if (auto *Info = State.dyn_cast()) + if (auto *Info = dyn_cast_if_present(State)) Info->MD = MD; else State = MD; @@ -940,7 +940,7 @@ class Preprocessor { void setOverriddenMacros(Preprocessor &PP, ArrayRef Overrides) { - auto *Info = State.dyn_cast(); + auto *Info = dyn_cast_if_present(State); if (!Info) { if (Overrides.empty()) return; diff --git a/clang/lib/APINotes/APINotesManager.cpp b/clang/lib/APINotes/APINotesManager.cpp index 70d96c735503f..7f8a126ffaa03 100644 --- a/clang/lib/APINotes/APINotesManager.cpp +++ b/clang/lib/APINotes/APINotesManager.cpp @@ -56,7 +56,7 @@ APINotesManager::APINotesManager(SourceManager &SM, const LangOptions &LangOpts) APINotesManager::~APINotesManager() { // Free the API notes readers. for (const auto &Entry : Readers) { - if (auto Reader = Entry.second.dyn_cast()) + if (auto Reader = dyn_cast_if_present(Entry.second)) delete Reader; } @@ -381,7 +381,7 @@ APINotesManager::findAPINotes(SourceLocation Loc) { } // We have the answer. 
- if (auto Reader = Known->second.dyn_cast()) + if (auto Reader = dyn_cast_if_present(Known->second)) Results.push_back(Reader); break; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 5ce03ce20d284..74bcb618f2950 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -2447,7 +2447,7 @@ bool VarDecl::isOutOfLine() const { } void VarDecl::setInit(Expr *I) { - if (auto *Eval = Init.dyn_cast()) { + if (auto *Eval = dyn_cast_if_present(Init)) { Eval->~EvaluatedStmt(); getASTContext().Deallocate(Eval); } @@ -2527,7 +2527,7 @@ bool VarDecl::isUsableInConstantExpressions(const ASTContext &Context) const { /// form, which contains extra information on the evaluated value of the /// initializer. EvaluatedStmt *VarDecl::ensureEvaluatedStmt() const { - auto *Eval = Init.dyn_cast(); + auto *Eval = dyn_cast_if_present(Init); if (!Eval) { // Note: EvaluatedStmt contains an APValue, which usually holds // resources not allocated from the ASTContext. We need to do some @@ -2541,7 +2541,7 @@ EvaluatedStmt *VarDecl::ensureEvaluatedStmt() const { } EvaluatedStmt *VarDecl::getEvaluatedStmt() const { - return Init.dyn_cast(); + return dyn_cast_if_present(Init); } APValue *VarDecl::evaluateValue() const { @@ -2784,8 +2784,8 @@ SourceLocation VarDecl::getPointOfInstantiation() const { } VarTemplateDecl *VarDecl::getDescribedVarTemplate() const { - return getASTContext().getTemplateOrSpecializationInfo(this) - .dyn_cast(); + return dyn_cast_if_present( + getASTContext().getTemplateOrSpecializationInfo(this)); } void VarDecl::setDescribedVarTemplate(VarTemplateDecl *Template) { @@ -2875,8 +2875,8 @@ MemberSpecializationInfo *VarDecl::getMemberSpecializationInfo() const { if (isStaticDataMember()) // FIXME: Remove ? // return getASTContext().getInstantiatedFromStaticDataMember(this); - return getASTContext().getTemplateOrSpecializationInfo(this) - .dyn_cast(); + return dyn_cast_if_present( + getASTContext().getTemplateOrSpecializationInfo(this)); return nullptr; } @@ -4040,11 +4040,11 @@ FunctionDecl *FunctionDecl::getInstantiatedFromMemberFunction() const { } MemberSpecializationInfo *FunctionDecl::getMemberSpecializationInfo() const { - if (auto *MSI = - TemplateOrSpecialization.dyn_cast()) + if (auto *MSI = dyn_cast_if_present( + TemplateOrSpecialization)) return MSI; - if (auto *FTSI = TemplateOrSpecialization - .dyn_cast()) + if (auto *FTSI = dyn_cast_if_present( + TemplateOrSpecialization)) return FTSI->getMemberSpecializationInfo(); return nullptr; } @@ -4062,7 +4062,7 @@ FunctionDecl::setInstantiationOfMemberFunction(ASTContext &C, FunctionTemplateDecl *FunctionDecl::getDescribedFunctionTemplate() const { return dyn_cast_if_present( - TemplateOrSpecialization.dyn_cast()); + dyn_cast_if_present(TemplateOrSpecialization)); } void FunctionDecl::setDescribedFunctionTemplate( @@ -4181,9 +4181,9 @@ FunctionDecl::getTemplateInstantiationPattern(bool ForDefinition) const { } FunctionTemplateDecl *FunctionDecl::getPrimaryTemplate() const { - if (FunctionTemplateSpecializationInfo *Info - = TemplateOrSpecialization - .dyn_cast()) { + if (FunctionTemplateSpecializationInfo *Info = + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->getTemplate(); } return nullptr; @@ -4191,15 +4191,15 @@ FunctionTemplateDecl *FunctionDecl::getPrimaryTemplate() const { FunctionTemplateSpecializationInfo * FunctionDecl::getTemplateSpecializationInfo() const { - return TemplateOrSpecialization - .dyn_cast(); + return dyn_cast_if_present( + TemplateOrSpecialization); } const 
TemplateArgumentList * FunctionDecl::getTemplateSpecializationArgs() const { - if (FunctionTemplateSpecializationInfo *Info - = TemplateOrSpecialization - .dyn_cast()) { + if (FunctionTemplateSpecializationInfo *Info = + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->TemplateArguments; } return nullptr; @@ -4207,14 +4207,14 @@ FunctionDecl::getTemplateSpecializationArgs() const { const ASTTemplateArgumentListInfo * FunctionDecl::getTemplateSpecializationArgsAsWritten() const { - if (FunctionTemplateSpecializationInfo *Info - = TemplateOrSpecialization - .dyn_cast()) { + if (FunctionTemplateSpecializationInfo *Info = + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->TemplateArgumentsAsWritten; } if (DependentFunctionTemplateSpecializationInfo *Info = - TemplateOrSpecialization - .dyn_cast()) { + dyn_cast_if_present( + TemplateOrSpecialization)) { return Info->TemplateArgumentsAsWritten; } return nullptr; @@ -4239,7 +4239,8 @@ void FunctionDecl::setFunctionTemplateSpecialization( FunctionTemplateSpecializationInfo::Create( C, this, Template, TSK, TemplateArgs, TemplateArgsAsWritten, PointOfInstantiation, - TemplateOrSpecialization.dyn_cast()); + dyn_cast_if_present( + TemplateOrSpecialization)); TemplateOrSpecialization = Info; Template->addSpecialization(Info, InsertPos); } @@ -4256,8 +4257,8 @@ void FunctionDecl::setDependentTemplateSpecialization( DependentFunctionTemplateSpecializationInfo * FunctionDecl::getDependentSpecializationInfo() const { - return TemplateOrSpecialization - .dyn_cast(); + return dyn_cast_if_present( + TemplateOrSpecialization); } DependentFunctionTemplateSpecializationInfo * @@ -4288,12 +4289,13 @@ TemplateSpecializationKind FunctionDecl::getTemplateSpecializationKind() const { // For a function template specialization, query the specialization // information object. if (FunctionTemplateSpecializationInfo *FTSInfo = - TemplateOrSpecialization - .dyn_cast()) + dyn_cast_if_present( + TemplateOrSpecialization)) return FTSInfo->getTemplateSpecializationKind(); if (MemberSpecializationInfo *MSInfo = - TemplateOrSpecialization.dyn_cast()) + dyn_cast_if_present( + TemplateOrSpecialization)) return MSInfo->getTemplateSpecializationKind(); // A dependent function template specialization is an explicit specialization, @@ -4331,15 +4333,16 @@ FunctionDecl::getTemplateSpecializationKindForInstantiation() const { // of A::f, and that A::f should be implicitly instantiated // from A::f if a definition is needed. 
if (FunctionTemplateSpecializationInfo *FTSInfo = - TemplateOrSpecialization - .dyn_cast()) { + dyn_cast_if_present( + TemplateOrSpecialization)) { if (auto *MSInfo = FTSInfo->getMemberSpecializationInfo()) return MSInfo->getTemplateSpecializationKind(); return FTSInfo->getTemplateSpecializationKind(); } if (MemberSpecializationInfo *MSInfo = - TemplateOrSpecialization.dyn_cast()) + dyn_cast_if_present( + TemplateOrSpecialization)) return MSInfo->getTemplateSpecializationKind(); if (isa( diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 44f45898fb483..c0a4356dcb004 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -1987,7 +1987,8 @@ CXXRecordDecl *CXXRecordDecl::getInstantiatedFromMemberClass() const { } MemberSpecializationInfo *CXXRecordDecl::getMemberSpecializationInfo() const { - return TemplateOrInstantiation.dyn_cast(); + return dyn_cast_if_present( + TemplateOrInstantiation); } void @@ -2001,7 +2002,7 @@ CXXRecordDecl::setInstantiationOfMemberClass(CXXRecordDecl *RD, } ClassTemplateDecl *CXXRecordDecl::getDescribedClassTemplate() const { - return TemplateOrInstantiation.dyn_cast(); + return dyn_cast_if_present(TemplateOrInstantiation); } void CXXRecordDecl::setDescribedClassTemplate(ClassTemplateDecl *Template) { @@ -2045,7 +2046,7 @@ const CXXRecordDecl *CXXRecordDecl::getTemplateInstantiationPattern() const { // specialization from which it was instantiated. if (auto *TD = dyn_cast(this)) { auto From = TD->getInstantiatedFrom(); - if (auto *CTD = From.dyn_cast()) { + if (auto *CTD = dyn_cast_if_present(From)) { while (auto *NewCTD = CTD->getInstantiatedFromMemberTemplate()) { if (NewCTD->isMemberSpecialization()) break; @@ -2054,7 +2055,8 @@ const CXXRecordDecl *CXXRecordDecl::getTemplateInstantiationPattern() const { return GetDefinitionOrSelf(CTD->getTemplatedDecl()); } if (auto *CTPSD = - From.dyn_cast()) { + dyn_cast_if_present( + From)) { while (auto *NewCTPSD = CTPSD->getInstantiatedFromMember()) { if (NewCTPSD->isMemberSpecialization()) break; diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 2933ba7fb8a29..926b2b26dd381 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1077,7 +1077,7 @@ ClassTemplateSpecializationDecl::getSourceRange() const { } void ClassTemplateSpecializationDecl::setExternKeywordLoc(SourceLocation Loc) { - auto *Info = ExplicitInfo.dyn_cast(); + auto *Info = dyn_cast_if_present(ExplicitInfo); if (!Info) { // Don't allocate if the location is invalid. if (Loc.isInvalid()) @@ -1091,7 +1091,7 @@ void ClassTemplateSpecializationDecl::setExternKeywordLoc(SourceLocation Loc) { void ClassTemplateSpecializationDecl::setTemplateKeywordLoc( SourceLocation Loc) { - auto *Info = ExplicitInfo.dyn_cast(); + auto *Info = dyn_cast_if_present(ExplicitInfo); if (!Info) { // Don't allocate if the location is invalid. 
if (Loc.isInvalid()) diff --git a/clang/lib/AST/TemplateName.cpp b/clang/lib/AST/TemplateName.cpp index 7d6275caedc4f..3a1eb1ca12f45 100644 --- a/clang/lib/AST/TemplateName.cpp +++ b/clang/lib/AST/TemplateName.cpp @@ -182,7 +182,8 @@ TemplateDecl *TemplateName::getAsTemplateDecl(bool IgnoreDeduced) const { "Unexpected canonical DeducedTemplateName; Did you mean to use " "getTemplateDeclAndDefaultArgs instead?"); - return cast_if_present(Name.Storage.dyn_cast()); + return cast_if_present( + dyn_cast_if_present(Name.Storage)); } std::pair @@ -208,7 +209,7 @@ TemplateName::getTemplateDeclAndDefaultArgs() const { } std::optional TemplateName::desugar(bool IgnoreDeduced) const { - if (Decl *D = Storage.dyn_cast()) { + if (Decl *D = dyn_cast_if_present(Storage)) { if (auto *USD = dyn_cast(D)) return TemplateName(USD->getTargetDecl()); return std::nullopt; @@ -242,7 +243,7 @@ AssumedTemplateStorage *TemplateName::getAsAssumedTemplateName() const { SubstTemplateTemplateParmStorage * TemplateName::getAsSubstTemplateTemplateParm() const { if (UncommonTemplateNameStorage *uncommon = - Storage.dyn_cast()) + dyn_cast_if_present(Storage)) return uncommon->getAsSubstTemplateTemplateParm(); return nullptr; @@ -258,7 +259,7 @@ TemplateName::getAsSubstTemplateTemplateParmPack() const { } QualifiedTemplateName *TemplateName::getAsQualifiedTemplateName() const { - return Storage.dyn_cast(); + return dyn_cast_if_present(Storage); } DependentTemplateName *TemplateName::getAsDependentTemplateName() const { @@ -276,7 +277,7 @@ UsingShadowDecl *TemplateName::getAsUsingShadowDecl() const { DeducedTemplateStorage *TemplateName::getAsDeducedTemplateName() const { if (UncommonTemplateNameStorage *Uncommon = - Storage.dyn_cast()) + dyn_cast_if_present(Storage)) return Uncommon->getAsDeducedTemplateName(); return nullptr; diff --git a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp index 0887b5a504f05..131334269aa75 100644 --- a/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp +++ b/clang/lib/Frontend/SerializedDiagnosticPrinter.cpp @@ -650,7 +650,7 @@ void SDiagsWriter::EmitDiagnosticMessage(FullSourceLoc Loc, PresumedLoc PLoc, Record.push_back(getStableLevel(Level)); AddLocToRecord(Loc, PLoc, Record); - if (const Diagnostic *Info = D.dyn_cast()) { + if (const Diagnostic *Info = dyn_cast_if_present(D)) { // Emit the category string lazily and get the category ID. 
unsigned DiagID = DiagnosticIDs::getCategoryNumberForDiag(Info->getID()); Record.push_back(getEmitCategory(DiagID)); diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index ad49eac66e98e..c3ff247a6316d 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -17700,9 +17700,11 @@ Sema::ActOnTag(Scope *S, unsigned TagSpec, TagUseKind TUK, SourceLocation KWLoc, return PrevTagDecl; QualType EnumUnderlyingTy; - if (TypeSourceInfo *TI = EnumUnderlying.dyn_cast()) + if (TypeSourceInfo *TI = + dyn_cast_if_present(EnumUnderlying)) EnumUnderlyingTy = TI->getType().getUnqualifiedType(); - else if (const Type *T = EnumUnderlying.dyn_cast()) + else if (const Type *T = + dyn_cast_if_present(EnumUnderlying)) EnumUnderlyingTy = QualType(T, 0); // All conflicts with previous declarations are recovered by diff --git a/clang/tools/libclang/CIndexDiagnostic.cpp b/clang/tools/libclang/CIndexDiagnostic.cpp index 34792d5bdfaaf..92271d9c37f86 100644 --- a/clang/tools/libclang/CIndexDiagnostic.cpp +++ b/clang/tools/libclang/CIndexDiagnostic.cpp @@ -92,7 +92,8 @@ class CXDiagnosticRenderer : public DiagnosticNoteRenderer { void beginDiagnostic(DiagOrStoredDiag D, DiagnosticsEngine::Level Level) override { - const StoredDiagnostic *SD = D.dyn_cast(); + const StoredDiagnostic *SD = + dyn_cast_if_present(D); if (!SD) return; From 04d5608057f73cf8deb66ddaeddf2f9254fd864b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 14:05:11 -0800 Subject: [PATCH 110/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124430) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect P to be nonnull. --- clang/lib/AST/DeclTemplate.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 926b2b26dd381..e4cb7dcb16a45 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1801,8 +1801,7 @@ TemplateParameterList *clang::getReplacedTemplateParameterList(Decl *D) { case Decl::Kind::VarTemplateSpecialization: { const auto *VTSD = cast(D); auto P = VTSD->getSpecializedTemplateOrPartial(); - if (const auto *VTPSD = - P.dyn_cast()) + if (const auto *VTPSD = dyn_cast(P)) return VTPSD->getTemplateParameters(); return cast(P)->getTemplateParameters(); } From 19a6ac18ef3e92017db49668ee365e694157f317 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 15:18:13 -0800 Subject: [PATCH 111/432] [ELF] EhFrame: replace failOn with errOn These diagnostics are mostly reported by a thread during writeSections. In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. 
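A minimal sketch of the recoverable pattern (hypothetical, self-contained
C++; errOn and the error count only mirror the lld helpers by name, the
rest is illustrative): the reader reports and keeps going with benign
values, and the link still fails at the end through the accumulated
error count, as in the errCount check in Writer::run.

  #include <cstdio>

  struct Ctx { int errCount = 0; };

  void errOn(Ctx &ctx, const char *msg) {
    std::fprintf(stderr, "error: %s\n", msg); // report, but do not exit
    ++ctx.errCount;
  }

  void link(Ctx &ctx) {
    errOn(ctx, "corrupted .eh_frame: unexpected end of CIE");
    if (ctx.errCount)
      return; // diagnostics already printed; process exits with code 1
    // writeOutput(ctx); // only reached when parsing was clean
  }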
--- lld/ELF/EhFrame.cpp | 52 +++++++++++++++++++------------- lld/test/ELF/invalid-eh-frame2.s | 1 + lld/test/ELF/invalid-eh-frame4.s | 1 + lld/test/ELF/invalid-eh-frame6.s | 1 + 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/lld/ELF/EhFrame.cpp b/lld/ELF/EhFrame.cpp index 6e0120e14988b..5d5a7bc0ab966 100644 --- a/lld/ELF/EhFrame.cpp +++ b/lld/ELF/EhFrame.cpp @@ -41,11 +41,10 @@ class EhReader { bool hasLSDA(); private: - template void failOn(const P *loc, const Twine &msg) { + template void errOn(const P *loc, const Twine &msg) { Ctx &ctx = isec->file->ctx; - Fatal(ctx) << "corrupted .eh_frame: " << msg << "\n>>> defined in " - << isec->getObjMsg((const uint8_t *)loc - - isec->content().data()); + Err(ctx) << "corrupted .eh_frame: " << msg << "\n>>> defined in " + << isec->getObjMsg((const uint8_t *)loc - isec->content().data()); } uint8_t readByte(); @@ -62,8 +61,10 @@ class EhReader { // Read a byte and advance D by one byte. uint8_t EhReader::readByte() { - if (d.empty()) - failOn(d.data(), "unexpected end of CIE"); + if (d.empty()) { + errOn(d.data(), "unexpected end of CIE"); + return 0; + } uint8_t b = d.front(); d = d.slice(1); return b; @@ -71,15 +72,18 @@ uint8_t EhReader::readByte() { void EhReader::skipBytes(size_t count) { if (d.size() < count) - failOn(d.data(), "CIE is too small"); - d = d.slice(count); + errOn(d.data(), "CIE is too small"); + else + d = d.slice(count); } // Read a null-terminated string. StringRef EhReader::readString() { const uint8_t *end = llvm::find(d, '\0'); - if (end == d.end()) - failOn(d.data(), "corrupted CIE (failed to read string)"); + if (end == d.end()) { + errOn(d.data(), "corrupted CIE (failed to read string)"); + return {}; + } StringRef s = toStringRef(d.slice(0, end - d.begin())); d = d.slice(s.size() + 1); return s; @@ -97,7 +101,7 @@ void EhReader::skipLeb128() { if ((val & 0x80) == 0) return; } - failOn(errPos, "corrupted CIE (failed to read LEB128)"); + errOn(errPos, "corrupted CIE (failed to read LEB128)"); } static size_t getAugPSize(Ctx &ctx, unsigned enc) { @@ -121,12 +125,12 @@ static size_t getAugPSize(Ctx &ctx, unsigned enc) { void EhReader::skipAugP() { uint8_t enc = readByte(); if ((enc & 0xf0) == DW_EH_PE_aligned) - failOn(d.data() - 1, "DW_EH_PE_aligned encoding is not supported"); + return errOn(d.data() - 1, "DW_EH_PE_aligned encoding is not supported"); size_t size = getAugPSize(isec->getCtx(), enc); if (size == 0) - failOn(d.data() - 1, "unknown FDE encoding"); + return errOn(d.data() - 1, "unknown FDE encoding"); if (size >= d.size()) - failOn(d.data() - 1, "corrupted CIE"); + return errOn(d.data() - 1, "corrupted CIE"); d = d.slice(size); } @@ -141,9 +145,11 @@ bool elf::hasLSDA(const EhSectionPiece &p) { StringRef EhReader::getAugmentation() { skipBytes(8); int version = readByte(); - if (version != 1 && version != 3) - failOn(d.data() - 1, - "FDE version 1 or 3 expected, but got " + Twine(version)); + if (version != 1 && version != 3) { + errOn(d.data() - 1, + "FDE version 1 or 3 expected, but got " + Twine(version)); + return {}; + } StringRef aug = readString(); @@ -174,8 +180,10 @@ uint8_t EhReader::getFdeEncoding() { readByte(); else if (c == 'P') skipAugP(); - else if (c != 'B' && c != 'S' && c != 'G') - failOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + else if (c != 'B' && c != 'S' && c != 'G') { + errOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + break; + } } return DW_EH_PE_absptr; } @@ -191,8 +199,10 @@ bool EhReader::hasLSDA() { skipAugP(); else if 
(c == 'R') readByte(); - else if (c != 'B' && c != 'S' && c != 'G') - failOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + else if (c != 'B' && c != 'S' && c != 'G') { + errOn(aug.data(), "unknown .eh_frame augmentation string: " + aug); + break; + } } return false; } diff --git a/lld/test/ELF/invalid-eh-frame2.s b/lld/test/ELF/invalid-eh-frame2.s index 87ce8ede72503..01f38738519b6 100644 --- a/lld/test/ELF/invalid-eh-frame2.s +++ b/lld/test/ELF/invalid-eh-frame2.s @@ -2,6 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t # RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec # CHECK: error: corrupted .eh_frame: corrupted CIE (failed to read string) # CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0x9) diff --git a/lld/test/ELF/invalid-eh-frame4.s b/lld/test/ELF/invalid-eh-frame4.s index a567bd40d73ef..60bbc7f22717c 100644 --- a/lld/test/ELF/invalid-eh-frame4.s +++ b/lld/test/ELF/invalid-eh-frame4.s @@ -2,6 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t # RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s # CHECK: corrupted .eh_frame: unknown .eh_frame augmentation string: diff --git a/lld/test/ELF/invalid-eh-frame6.s b/lld/test/ELF/invalid-eh-frame6.s index 77be15f54e6b1..6888419da3e3d 100644 --- a/lld/test/ELF/invalid-eh-frame6.s +++ b/lld/test/ELF/invalid-eh-frame6.s @@ -2,6 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t # RUN: not ld.lld --eh-frame-hdr %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --eh-frame-hdr %t -o /dev/null --noinhibit-exec # CHECK: error: corrupted .eh_frame: unknown FDE encoding # CHECK-NEXT: >>> defined in {{.*}}:(.eh_frame+0xe) From 0f3c2884f3ccbdbe396e4388feb8be716b50dd68 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 15:25:04 -0800 Subject: [PATCH 112/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124433) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect U to be nonnull. 
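For illustration, the semantic difference on a PointerUnion (a
hypothetical standalone example; A and B are made-up types, not clang
classes):

  #include "llvm/ADT/PointerUnion.h"

  struct A { int x; };
  struct B { int y; };

  // Tolerates a null union: yields nullptr when U is null or holds a B *.
  A *getAOrNull(llvm::PointerUnion<A *, B *> U) {
    return llvm::dyn_cast_if_present<A *>(U);
  }

  // Asserts that U is non-null and only checks the held member type; valid
  // only where nullness is impossible, the property relied on here.
  A *getANonNull(llvm::PointerUnion<A *, B *> U) {
    return llvm::dyn_cast<A *>(U);
  }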
--- clang/lib/AST/ParentMapContext.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ParentMapContext.cpp b/clang/lib/AST/ParentMapContext.cpp index af7d9fcdc638b..7ff492443031d 100644 --- a/clang/lib/AST/ParentMapContext.cpp +++ b/clang/lib/AST/ParentMapContext.cpp @@ -103,9 +103,9 @@ class ParentMapContext::ParentMap { static DynTypedNode getSingleDynTypedNodeFromParentMap(ParentMapPointers::mapped_type U) { - if (const auto *D = U.dyn_cast()) + if (const auto *D = dyn_cast(U)) return DynTypedNode::create(*D); - if (const auto *S = U.dyn_cast()) + if (const auto *S = dyn_cast(U)) return DynTypedNode::create(*S); return *cast(U); } From d2c7cabe0453d6a6d03c15b7ae1800b53de9e182 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 25 Jan 2025 15:25:17 -0800 Subject: [PATCH 113/432] [Sema] Migrate away from PointerUnion::dyn_cast (NFC) (#124434) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect EWC->getObject(i) to be nonnull. --- clang/lib/Sema/JumpDiagnostics.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/JumpDiagnostics.cpp b/clang/lib/Sema/JumpDiagnostics.cpp index d465599450e7f..4b92d67e49d7d 100644 --- a/clang/lib/Sema/JumpDiagnostics.cpp +++ b/clang/lib/Sema/JumpDiagnostics.cpp @@ -561,12 +561,12 @@ void JumpScopeChecker::BuildScopeInformation(Stmt *S, // implementable but a lot of work which we haven't felt up to doing. ExprWithCleanups *EWC = cast(S); for (unsigned i = 0, e = EWC->getNumObjects(); i != e; ++i) { - if (auto *BDecl = EWC->getObject(i).dyn_cast()) + if (auto *BDecl = dyn_cast(EWC->getObject(i))) for (const auto &CI : BDecl->captures()) { VarDecl *variable = CI.getVariable(); BuildScopeInformation(variable, BDecl, origParentScope); } - else if (auto *CLE = EWC->getObject(i).dyn_cast()) + else if (auto *CLE = dyn_cast(EWC->getObject(i))) BuildScopeInformation(CLE, origParentScope); else llvm_unreachable("unexpected cleanup object type"); From 4f480481716553aa89142131f49e53e7d53c1998 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 15:28:17 -0800 Subject: [PATCH 114/432] [ELF] SHF_MERGE: avoid Fatal In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. 
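For context, the constraint being diagnosed (a hypothetical sketch, not
the exact lld code; compare ObjFile::shouldMerge below): an SHF_MERGE
section is an array of sh_entsize-byte entries, so its size must be an
exact multiple of sh_entsize, and it must not be writable.

  #include "llvm/BinaryFormat/ELF.h"
  #include <cstdint>

  // True if the section is a well-formed SHF_MERGE candidate.
  bool isWellFormedMergeSection(uint64_t shSize, uint64_t entSize,
                                uint64_t shFlags) {
    if (entSize == 0)
      return false; // sh_entsize 0: not mergeable, but not an error
    if (shSize % entSize != 0)
      return false; // e.g. size 2 with sh_entsize 4, as in the test below
    if (shFlags & llvm::ELF::SHF_WRITE)
      return false; // writable SHF_MERGE sections are unsupported
    return true;
  }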
--- lld/ELF/InputFiles.cpp | 11 +++++------ lld/test/ELF/invalid/merge-invalid-size.s | 2 +- lld/test/ELF/invalid/merge-writable.s | 1 + 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index c3c6812c26202..b29c7db879fa0 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -700,13 +700,12 @@ bool ObjFile::shouldMerge(const Elf_Shdr &sec, StringRef name) { if (entSize == 0) return false; if (sec.sh_size % entSize) - Fatal(ctx) << this << ":(" << name << "): SHF_MERGE section size (" - << uint64_t(sec.sh_size) - << ") must be a multiple of sh_entsize (" << entSize << ")"; - + ErrAlways(ctx) << this << ":(" << name << "): SHF_MERGE section size (" + << uint64_t(sec.sh_size) + << ") must be a multiple of sh_entsize (" << entSize << ")"; if (sec.sh_flags & SHF_WRITE) - Fatal(ctx) << this << ":(" << name - << "): writable SHF_MERGE section is not supported"; + Err(ctx) << this << ":(" << name + << "): writable SHF_MERGE section is not supported"; return true; } diff --git a/lld/test/ELF/invalid/merge-invalid-size.s b/lld/test/ELF/invalid/merge-invalid-size.s index 71c3f98e75529..82ad1f97b4a93 100644 --- a/lld/test/ELF/invalid/merge-invalid-size.s +++ b/lld/test/ELF/invalid/merge-invalid-size.s @@ -1,6 +1,6 @@ // REQUIRES: x86 // RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64-pc-linux -// RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +// RUN: not ld.lld %t.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s // CHECK: merge-invalid-size.s.tmp.o:(.foo): SHF_MERGE section size (2) must be a multiple of sh_entsize (4) .section .foo,"aM",@progbits,4 diff --git a/lld/test/ELF/invalid/merge-writable.s b/lld/test/ELF/invalid/merge-writable.s index 0c5fe92481da0..24a274b193576 100644 --- a/lld/test/ELF/invalid/merge-writable.s +++ b/lld/test/ELF/invalid/merge-writable.s @@ -1,6 +1,7 @@ // REQUIRES: x86 // RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64-pc-linux // RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +// RUN: ld.lld %t.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s // CHECK: merge-writable.s.tmp.o:(.foo): writable SHF_MERGE section is not supported .section .foo,"awM",@progbits,4 From c7579bfba5969377f7fb4239cc05d6cd4a077957 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 15:50:29 -0800 Subject: [PATCH 115/432] [ELF] -o -: suppress output if disableOutput So that LLD_IN_TEST=2 ld.lld -o - a.o only writes the output once. --- lld/ELF/Writer.cpp | 8 +++++--- lld/test/ELF/stdout.s | 3 ++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index fe4a0a15ae835..b7c4790655e8a 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -381,9 +381,11 @@ template void Writer::run() { if (errCount(ctx)) return; - if (auto e = buffer->commit()) - Err(ctx) << "failed to write output '" << buffer->getPath() - << "': " << std::move(e); + if (!ctx.e.disableOutput) { + if (auto e = buffer->commit()) + Err(ctx) << "failed to write output '" << buffer->getPath() + << "': " << std::move(e); + } if (!ctx.arg.cmseOutputLib.empty()) writeARMCmseImportLib(ctx); diff --git a/lld/test/ELF/stdout.s b/lld/test/ELF/stdout.s index 64cf64a72b4b6..b5ec07cfabfe9 100644 --- a/lld/test/ELF/stdout.s +++ b/lld/test/ELF/stdout.s @@ -1,7 +1,8 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o -# RUN: ld.lld %t.o -o - > %t1 +## Test that we only write to "-" once. 
+# RUN: LLD_IN_TEST=2 ld.lld %t.o -o - > %t1 # RUN: llvm-objdump -d %t1 | FileCheck %s # CHECK: nop From 7db789b5702714ffb6c96ad53c3136ca0a4300b2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:00:51 -0800 Subject: [PATCH 116/432] [ELF] Replace a few Fatal with Err In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change a few Fatal to recoverable Err. --- lld/ELF/InputSection.cpp | 24 +++++++++++++-------- lld/test/ELF/compressed-input-err.s | 1 + lld/test/ELF/invalid/section-alignment.test | 1 + lld/test/ELF/invalid/section-alignment2.s | 1 + 4 files changed, 18 insertions(+), 9 deletions(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 42ef530b79d89..56928b7c9547b 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -71,8 +71,10 @@ InputSectionBase::InputSectionBase(InputFile *file, StringRef name, // The ELF spec states that a value of 0 means the section has // no alignment constraints. uint32_t v = std::max(addralign, 1); - if (!isPowerOf2_64(v)) - Fatal(getCtx()) << this << ": sh_addralign is not a power of 2"; + if (!isPowerOf2_64(v)) { + Err(getCtx()) << this << ": sh_addralign is not a power of 2"; + v = 1; + } this->addralign = v; // If SHF_COMPRESSED is set, parse the header. The legacy .zdebug format is no @@ -104,8 +106,10 @@ InputSectionBase::InputSectionBase(ObjFile &file, // We reject object files having insanely large alignments even though // they are allowed by the spec. I think 4GB is a reasonable limitation. // We might want to relax this in the future. - if (hdr.sh_addralign > UINT32_MAX) - Fatal(getCtx()) << &file << ": section sh_addralign is too large"; + if (hdr.sh_addralign > UINT32_MAX) { + Err(getCtx()) << &file << ": section sh_addralign is too large"; + addralign = 1; + } } size_t InputSectionBase::getSize() const { @@ -123,7 +127,7 @@ static void decompressAux(Ctx &ctx, const InputSectionBase &sec, uint8_t *out, if (Error e = hdr->ch_type == ELFCOMPRESS_ZLIB ? compression::zlib::decompress(compressed, out, size) : compression::zstd::decompress(compressed, out, size)) - Fatal(ctx) << &sec << ": decompress failed: " << std::move(e); + Err(ctx) << &sec << ": decompress failed: " << std::move(e); } void InputSectionBase::decompress() const { @@ -649,9 +653,11 @@ static uint64_t getRISCVUndefinedRelativeWeakVA(uint64_t type, uint64_t p) { // of the RW segment. static uint64_t getARMStaticBase(const Symbol &sym) { OutputSection *os = sym.getOutputSection(); - if (!os || !os->ptLoad || !os->ptLoad->firstSec) - Fatal(os->ctx) << "SBREL relocation to " << sym.getName() - << " without static base"; + if (!os || !os->ptLoad || !os->ptLoad->firstSec) { + Err(os->ctx) << "SBREL relocation to " << sym.getName() + << " without static base"; + return 0; + } return os->ptLoad->firstSec->addr; } @@ -1304,7 +1310,7 @@ template void InputSection::writeTo(Ctx &ctx, uint8_t *buf) { if (Error e = hdr->ch_type == ELFCOMPRESS_ZLIB ? 
compression::zlib::decompress(compressed, buf, size) : compression::zstd::decompress(compressed, buf, size)) - Fatal(ctx) << this << ": decompress failed: " << std::move(e); + Err(ctx) << this << ": decompress failed: " << std::move(e); uint8_t *bufEnd = buf + size; relocate(ctx, buf, bufEnd); return; diff --git a/lld/test/ELF/compressed-input-err.s b/lld/test/ELF/compressed-input-err.s index 83b1f62d7e495..7251585ed5d70 100644 --- a/lld/test/ELF/compressed-input-err.s +++ b/lld/test/ELF/compressed-input-err.s @@ -9,6 +9,7 @@ # RUN: yaml2obj --docnum=3 %s -o %t3.o # RUN: not ld.lld %t3.o -o /dev/null -shared 2>&1 | FileCheck %s +# RUN: ld.lld %t3.o -o /dev/null -shared --noinhibit-exec ## Check we are able to report zlib decompress errors. # CHECK: error: {{.*}}.o:(.debug_info): decompress failed: zlib error: Z_DATA_ERROR diff --git a/lld/test/ELF/invalid/section-alignment.test b/lld/test/ELF/invalid/section-alignment.test index 8099ec01849b6..32e673f82992b 100644 --- a/lld/test/ELF/invalid/section-alignment.test +++ b/lld/test/ELF/invalid/section-alignment.test @@ -1,5 +1,6 @@ # RUN: yaml2obj %s -o %t # RUN: not ld.lld %t -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s ## In current lld implementation, we do not accept sh_addralign ## larger than UINT32_MAX. diff --git a/lld/test/ELF/invalid/section-alignment2.s b/lld/test/ELF/invalid/section-alignment2.s index c130bbbaa071f..c180860ca4127 100644 --- a/lld/test/ELF/invalid/section-alignment2.s +++ b/lld/test/ELF/invalid/section-alignment2.s @@ -1,5 +1,6 @@ # RUN: yaml2obj %s -o %t.o # RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t.o -o /dev/null --noinhibit-exec # CHECK: error: {{.*}}.o:(.text): sh_addralign is not a power of 2 From 6b87f01aaaa9d7c6eef8b66e48f13eb8492c7503 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:20:27 -0800 Subject: [PATCH 117/432] [ELF] MergeInputSection: replace Fatal with Err In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change a few Fatal to recoverable Err. --- lld/ELF/InputSection.cpp | 13 +++++++++---- lld/test/ELF/merge-string-error.s | 11 ----------- lld/test/ELF/mergeable-errors.s | 1 + lld/test/ELF/relocation-past-merge-end.s | 9 ++++++++- 4 files changed, 18 insertions(+), 16 deletions(-) delete mode 100644 lld/test/ELF/merge-string-error.s diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 56928b7c9547b..52c472bb89caf 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -1433,8 +1433,11 @@ static size_t findNull(StringRef s, size_t entSize) { void MergeInputSection::splitStrings(StringRef s, size_t entSize) { const bool live = !(flags & SHF_ALLOC) || !getCtx().arg.gcSections; const char *p = s.data(), *end = s.data() + s.size(); - if (!std::all_of(end - entSize, end, [](char c) { return c == 0; })) - Fatal(getCtx()) << this << ": string is not null terminated"; + if (!std::all_of(end - entSize, end, [](char c) { return c == 0; })) { + Err(getCtx()) << this << ": string is not null terminated"; + pieces.emplace_back(entSize, 0, false); + return; + } if (entSize == 1) { // Optimize the common case. 
do { @@ -1494,8 +1497,10 @@ void MergeInputSection::splitIntoPieces() { } SectionPiece &MergeInputSection::getSectionPiece(uint64_t offset) { - if (content().size() <= offset) - Fatal(getCtx()) << this << ": offset is outside the section"; + if (content().size() <= offset) { + Err(getCtx()) << this << ": offset is outside the section"; + return pieces[0]; + } return partition_point( pieces, [=](SectionPiece p) { return p.inputOff <= offset; })[-1]; } diff --git a/lld/test/ELF/merge-string-error.s b/lld/test/ELF/merge-string-error.s deleted file mode 100644 index bd77a4c1dce87..0000000000000 --- a/lld/test/ELF/merge-string-error.s +++ /dev/null @@ -1,11 +0,0 @@ -// REQUIRES: x86 -// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: not ld.lld %t.o -o /dev/null -shared 2>&1 | FileCheck %s - - .section .rodata.str1.1,"aMS",@progbits,1 - .asciz "abc" - - .data - .quad .rodata.str1.1 + 4 - -// CHECK: merge-string-error.s.tmp.o:(.rodata.str1.1): offset is outside the section diff --git a/lld/test/ELF/mergeable-errors.s b/lld/test/ELF/mergeable-errors.s index d67cd91c97fbf..b155d581046a8 100644 --- a/lld/test/ELF/mergeable-errors.s +++ b/lld/test/ELF/mergeable-errors.s @@ -1,6 +1,7 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t.o -o /dev/null --noinhibit-exec # CHECK: error: {{.*}}.o:(.mergeable): string is not null terminated diff --git a/lld/test/ELF/relocation-past-merge-end.s b/lld/test/ELF/relocation-past-merge-end.s index 15214a5a4fc05..1dced95c49ac2 100644 --- a/lld/test/ELF/relocation-past-merge-end.s +++ b/lld/test/ELF/relocation-past-merge-end.s @@ -1,9 +1,16 @@ // REQUIRES: x86 // RUN: llvm-mc %s -o %t.o -filetype=obj -triple=x86_64-pc-linux // RUN: not ld.lld %t.o -o /dev/null -shared 2>&1 | FileCheck %s -// CHECK: relocation-past-merge-end.s.tmp.o:(.foo): offset is outside the section +// RUN: ld.lld %t.o -o /dev/null -shared --noinhibit-exec 2>&1 | FileCheck %s +// CHECK: .o:(.foo): offset is outside the section +// CHECCK: .o:(.rodata.str1.1): offset is outside the section .data .quad .foo + 10 +.quad .rodata.str1.1 + 4 + .section .foo,"aM",@progbits,4 .quad 0 + +.section .rodata.str1.1,"aMS",@progbits,1 +.asciz "abc" From a9e92beb253d4bbd7636d99f100940534f3a7f36 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:54:19 -0800 Subject: [PATCH 118/432] [ELF] openAuxiliaryFile: open /dev/null if disableOutput and filename is "-" So that LLD_IN_TEST=2 ld.lld --print-archive-stats=- a.o (and -Map -) only writes the output once. --- lld/ELF/Driver.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 13e8f8ce6df20..c0a27b3939a54 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -105,6 +105,13 @@ llvm::raw_fd_ostream Ctx::openAuxiliaryFile(llvm::StringRef filename, using namespace llvm::sys::fs; OpenFlags flags = auxiliaryFiles.insert(filename).second ? 
OF_None : OF_Append; + if (e.disableOutput && filename == "-") { +#ifdef _WIN32 + filename = "NUL"; +#else + filename = "/dev/null"; +#endif + } return {filename, ec, flags}; } From b7195e8e040d57bbf502f34ec84d71bd123f85b8 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 16:58:52 -0800 Subject: [PATCH 119/432] [ELF,test] Add env LLD_IN_TEST=1 to make some tests work if RUN_LLD_MAIN_TWICE --- lld/test/ELF/basic.s | 2 +- lld/test/ELF/stdout.s | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lld/test/ELF/basic.s b/lld/test/ELF/basic.s index 587fd1641500a..b01f51eb4a2c7 100644 --- a/lld/test/ELF/basic.s +++ b/lld/test/ELF/basic.s @@ -220,7 +220,7 @@ _start: ## Test erroring on a recursive response file, but only once. # RUN: echo @%t.responsefile > %t.responsefile -# RUN: not ld.lld %t @%t.responsefile 2>&1 | FileCheck %s --check-prefix=RECRSP +# RUN: env LLD_IN_TEST=1 not ld.lld %t @%t.responsefile 2>&1 | FileCheck %s --check-prefix=RECRSP # RECRSP: recursive expansion of: '{{.*}}.responsefile' # RECRSP-NOT: recursive expansion of diff --git a/lld/test/ELF/stdout.s b/lld/test/ELF/stdout.s index b5ec07cfabfe9..e33ab3f792c40 100644 --- a/lld/test/ELF/stdout.s +++ b/lld/test/ELF/stdout.s @@ -2,7 +2,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o ## Test that we only write to "-" once. -# RUN: LLD_IN_TEST=2 ld.lld %t.o -o - > %t1 +# RUN: env LLD_IN_TEST=2 ld.lld %t.o -o - > %t1 # RUN: llvm-objdump -d %t1 | FileCheck %s # CHECK: nop From f21c35d54f8f7af9d0c64b566cabbc4f796a54df Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 17:29:28 -0800 Subject: [PATCH 120/432] [ELF] Replace some Fatal with Err In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change a few Fatal to recoverable Err. --- lld/ELF/Driver.cpp | 6 ++- lld/ELF/InputFiles.cpp | 52 +++++++++++++++---------- lld/ELF/Relocations.cpp | 6 ++- lld/test/ELF/invalid/section-index.test | 1 + lld/test/ELF/invalid/symbol-name.test | 1 + 5 files changed, 41 insertions(+), 25 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index c0a27b3939a54..770163f4de086 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2425,8 +2425,10 @@ static void findKeepUniqueSections(Ctx &ctx, opt::InputArgList &args) { unsigned size; const char *err = nullptr; uint64_t symIndex = decodeULEB128(cur, &size, contents.end(), &err); - if (err) - Fatal(ctx) << f << ": could not decode addrsig section: " << err; + if (err) { + Err(ctx) << f << ": could not decode addrsig section: " << err; + break; + } markAddrsig(icfSafe, syms[symIndex]); cur += size; } diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index b29c7db879fa0..e236057a0d6d1 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -921,17 +921,18 @@ static void readGnuProperty(Ctx &ctx, const InputSection &sec, using Elf_Note = typename ELFT::Note; ArrayRef data = sec.content(); - auto reportFatal = [&](const uint8_t *place, const Twine &msg) { - Fatal(ctx) << sec.file << ":(" << sec.name << "+0x" - << Twine::utohexstr(place - sec.content().data()) - << "): " << msg; + auto err = [&](const uint8_t *place) -> ELFSyncStream { + auto diag = Err(ctx); + diag << sec.file << ":(" << sec.name << "+0x" + << Twine::utohexstr(place - sec.content().data()) << "): "; + return diag; }; while (!data.empty()) { // Read one NOTE record. 
auto *nhdr = reinterpret_cast(data.data()); if (data.size() < sizeof(Elf_Nhdr) || data.size() < nhdr->getSize(sec.addralign)) - reportFatal(data.data(), "data is too short"); + return void(err(data.data()) << "data is too short"); Elf_Note note(*nhdr); if (nhdr->n_type != NT_GNU_PROPERTY_TYPE_0 || note.getName() != "GNU") { @@ -948,30 +949,32 @@ static void readGnuProperty(Ctx &ctx, const InputSection &sec, while (!desc.empty()) { const uint8_t *place = desc.data(); if (desc.size() < 8) - reportFatal(place, "program property is too short"); + return void(err(place) << "program property is too short"); uint32_t type = read32(desc.data()); uint32_t size = read32(desc.data() + 4); desc = desc.slice(8); if (desc.size() < size) - reportFatal(place, "program property is too short"); + return void(err(place) << "program property is too short"); if (type == featureAndType) { // We found a FEATURE_1_AND field. There may be more than one of these // in a .note.gnu.property section, for a relocatable object we // accumulate the bits set. if (size < 4) - reportFatal(place, "FEATURE_1_AND entry is too short"); + return void(err(place) << "FEATURE_1_AND entry is too short"); f.andFeatures |= read32(desc.data()); } else if (ctx.arg.emachine == EM_AARCH64 && type == GNU_PROPERTY_AARCH64_FEATURE_PAUTH) { if (!f.aarch64PauthAbiCoreInfo.empty()) { - reportFatal(data.data(), - "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are " - "not supported"); + return void( + err(data.data()) + << "multiple GNU_PROPERTY_AARCH64_FEATURE_PAUTH entries are " + "not supported"); } else if (size != 16) { - reportFatal(data.data(), "GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry " - "is invalid: expected 16 bytes, but got " + - Twine(size)); + return void(err(data.data()) + << "GNU_PROPERTY_AARCH64_FEATURE_PAUTH entry " + "is invalid: expected 16 bytes, but got " + << size); } f.aarch64PauthAbiCoreInfo = desc; } @@ -1173,8 +1176,10 @@ void ObjFile::initSectionsAndLocalSyms(bool ignoreComdats) { secIdx = check(getExtendedSymbolTableIndex(eSym, i, shndxTable)); else if (secIdx >= SHN_LORESERVE) secIdx = 0; - if (LLVM_UNLIKELY(secIdx >= sections.size())) - Fatal(ctx) << this << ": invalid section index: " << secIdx; + if (LLVM_UNLIKELY(secIdx >= sections.size())) { + Err(ctx) << this << ": invalid section index: " << secIdx; + secIdx = 0; + } if (LLVM_UNLIKELY(eSym.getBinding() != STB_LOCAL)) ErrAlways(ctx) << this << ": non-local symbol (" << i << ") found at index < .symtab's sh_info (" << end << ")"; @@ -1183,9 +1188,12 @@ void ObjFile::initSectionsAndLocalSyms(bool ignoreComdats) { uint8_t type = eSym.getType(); if (type == STT_FILE) sourceFile = CHECK2(eSym.getName(stringTable), this); - if (LLVM_UNLIKELY(stringTable.size() <= eSym.st_name)) - Fatal(ctx) << this << ": invalid symbol name offset"; - StringRef name(stringTable.data() + eSym.st_name); + unsigned stName = eSym.st_name; + if (LLVM_UNLIKELY(stringTable.size() <= stName)) { + Err(ctx) << this << ": invalid symbol name offset"; + stName = 0; + } + StringRef name(stringTable.data() + stName); symbols[i] = reinterpret_cast(locals + i); if (eSym.st_shndx == SHN_UNDEF || sec == &InputSection::discarded) @@ -1236,8 +1244,10 @@ template void ObjFile::postParse() { secIdx = 0; } - if (LLVM_UNLIKELY(secIdx >= sections.size())) - Fatal(ctx) << this << ": invalid section index: " << secIdx; + if (LLVM_UNLIKELY(secIdx >= sections.size())) { + Err(ctx) << this << ": invalid section index: " << secIdx; + continue; + } InputSectionBase *sec = sections[secIdx]; if (sec == 
&InputSection::discarded) { if (sym.traced) { diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 76b151b93d517..629702b45965b 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -428,8 +428,10 @@ class OffsetGetter { if (j == fdes.begin() || j[-1].inputOff + j[-1].size <= off) { while (i != cies.end() && i->inputOff <= off) ++i; - if (i == cies.begin() || i[-1].inputOff + i[-1].size <= off) - Fatal(ctx) << ".eh_frame: relocation is not in any piece"; + if (i == cies.begin() || i[-1].inputOff + i[-1].size <= off) { + Err(ctx) << ".eh_frame: relocation is not in any piece"; + return 0; + } it = i; } diff --git a/lld/test/ELF/invalid/section-index.test b/lld/test/ELF/invalid/section-index.test index cc8c6d067265a..370597b7b7a2d 100644 --- a/lld/test/ELF/invalid/section-index.test +++ b/lld/test/ELF/invalid/section-index.test @@ -3,6 +3,7 @@ # RUN: yaml2obj %s -o %t1.o # RUN: not ld.lld %t1.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t1.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s # CHECK: {{.*}}1.o: invalid section index: 256 !ELF diff --git a/lld/test/ELF/invalid/symbol-name.test b/lld/test/ELF/invalid/symbol-name.test index 1ae76f0bd81e7..73284a1b9b842 100644 --- a/lld/test/ELF/invalid/symbol-name.test +++ b/lld/test/ELF/invalid/symbol-name.test @@ -1,5 +1,6 @@ # RUN: yaml2obj %s -o %t.o # RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld %t.o -o /dev/null --noinhibit-exec # CHECK: error: {{.*}}.o: invalid symbol name offset ## YAML below contains symbol with name offset in st_name From 988978f964fb84cb99c83e6cd260dcc395afb6c2 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 17:51:24 -0800 Subject: [PATCH 121/432] [ELF,test] Add env LLD_IN_TEST=1 to make some tests work if RUN_LLD_MAIN_TWICE --- lld/test/ELF/invalid/bad-reloc-target.test | 2 +- lld/test/ELF/lto/cache-warnings.ll | 1 + lld/test/ELF/lto/ltopasses-custom.ll | 4 ++-- lld/test/ELF/lto/verify-invalid.ll | 6 +++--- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/lld/test/ELF/invalid/bad-reloc-target.test b/lld/test/ELF/invalid/bad-reloc-target.test index 88b4cdf96779f..6a1619e81b80b 100644 --- a/lld/test/ELF/invalid/bad-reloc-target.test +++ b/lld/test/ELF/invalid/bad-reloc-target.test @@ -51,7 +51,7 @@ Symbols: ## Relocation refers to a symbol with index larger than ## symbol table size. Check we report it. # RUN: yaml2obj --docnum=3 %s -o %t2.o -# RUN: not ld.lld %t2.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR3 +# RUN: env LLD_IN_TEST=1 not ld.lld %t2.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR3 # ERR3: error: {{.*}}.o: invalid symbol index --- !ELF diff --git a/lld/test/ELF/lto/cache-warnings.ll b/lld/test/ELF/lto/cache-warnings.ll index d8c5ea963ec13..d0224d5426ff3 100644 --- a/lld/test/ELF/lto/cache-warnings.ll +++ b/lld/test/ELF/lto/cache-warnings.ll @@ -1,4 +1,5 @@ ; REQUIRES: x86, shell +; UNSUPPORTED: main-run-twice ; RUN: opt -module-hash -module-summary %s -o %t.o ; RUN: opt -module-hash -module-summary %p/Inputs/cache.ll -o %t2.o diff --git a/lld/test/ELF/lto/ltopasses-custom.ll b/lld/test/ELF/lto/ltopasses-custom.ll index ecb024cecade4..e37083ca8b8c7 100644 --- a/lld/test/ELF/lto/ltopasses-custom.ll +++ b/lld/test/ELF/lto/ltopasses-custom.ll @@ -24,13 +24,13 @@ define void @barrier() { ; ATOMIC-NEXT: ret void ; Check that invalid passes are rejected gracefully. 
-; RUN: not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
+; RUN: env LLD_IN_TEST=1 not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
 ; RUN:   --lto-newpm-passes=iamnotapass -shared 2>&1 | \
 ; RUN:   FileCheck %s --check-prefix=INVALID
 ; INVALID: unable to parse pass pipeline description 'iamnotapass': unknown pass name 'iamnotapass'

 ; Check that invalid AA pipelines are rejected gracefully.
-; RUN: not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
+; RUN: env LLD_IN_TEST=1 not --crash ld.lld -m elf_x86_64 %t.o -o /dev/null \
 ; RUN:   --lto-newpm-passes=globaldce --lto-aa-pipeline=patatino \
 ; RUN:   -shared 2>&1 | \
 ; RUN:   FileCheck %s --check-prefix=INVALIDAA
diff --git a/lld/test/ELF/lto/verify-invalid.ll b/lld/test/ELF/lto/verify-invalid.ll
index d97d0e1b78b8c..cb8bb389a608b 100644
--- a/lld/test/ELF/lto/verify-invalid.ll
+++ b/lld/test/ELF/lto/verify-invalid.ll
@@ -1,10 +1,10 @@
 ; REQUIRES: x86
 ; RUN: llvm-as %s -o %t.o
-; RUN: ld.lld %t.o -o %t2 --lto-debug-pass-manager \
+; RUN: env LLD_IN_TEST=1 ld.lld %t.o -o %t2 --lto-debug-pass-manager \
 ; RUN:   2>&1 | FileCheck -check-prefix=DEFAULT-NPM %s
-; RUN: ld.lld %t.o -o %t2 --lto-debug-pass-manager \
+; RUN: env LLD_IN_TEST=1 ld.lld %t.o -o %t2 --lto-debug-pass-manager \
 ; RUN:   -disable-verify 2>&1 | FileCheck -check-prefix=DISABLE-NPM %s
-; RUN: ld.lld %t.o -o %t2 --lto-debug-pass-manager \
+; RUN: env LLD_IN_TEST=1 ld.lld %t.o -o %t2 --lto-debug-pass-manager \
 ; RUN:   --plugin-opt=disable-verify 2>&1 | FileCheck -check-prefix=DISABLE-NPM %s

 target triple = "x86_64-unknown-linux-gnu"

From f359c1f524bf826eba355b8863a870450eb747b0 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 25 Jan 2025 18:01:52 -0800
Subject: [PATCH 122/432] [ELF] Disable error handling script if disableOutput

Fix ELF/error-handling-script-linux.test when LLD_IN_TEST=2 is set.
---
 lld/Common/ErrorHandler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/Common/ErrorHandler.cpp b/lld/Common/ErrorHandler.cpp
index 716bce54258ce..a11960325a9cd 100644
--- a/lld/Common/ErrorHandler.cpp
+++ b/lld/Common/ErrorHandler.cpp
@@ -289,7 +289,7 @@ void ErrorHandler::error(const Twine &msg) {

 void ErrorHandler::error(const Twine &msg, ErrorTag tag,
                          ArrayRef<StringRef> args) {
-  if (errorHandlingScript.empty()) {
+  if (errorHandlingScript.empty() || disableOutput) {
     error(msg);
     return;
   }

From 18335f4800ae5491a11e74a574969d716acddce7 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 25 Jan 2025 18:06:22 -0800
Subject: [PATCH 123/432] [ELF] Ignore --time-trace if disableOutput

To avoid generating two JSON files for LLD_IN_TEST=2 ld.lld --time-trace.
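For context, a rough model of the test mode these patches accommodate (an
assumption pieced together from the commit messages, not lld's exact
driver code):

  #include <vector>

  // Assumed entry point; disableOutput must suppress every artifact the
  // link would produce (the output file, -Map, --time-trace JSON, ...).
  int lldMain(const std::vector<const char *> &args, bool disableOutput);

  // LLD_IN_TEST=N: run the link N times in one process, with output
  // disabled on all but the final pass; a --time-trace JSON emitted on
  // every pass would therefore be written twice.
  int runLldInTest(const std::vector<const char *> &args, int n) {
    int ret = 0;
    for (int i = 0; i < n; ++i)
      ret = lldMain(args, /*disableOutput=*/i + 1 != n);
    return ret;
  }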
--- lld/ELF/Driver.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 770163f4de086..7e0d3fca31353 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1463,7 +1463,8 @@ static void readConfigs(Ctx &ctx, opt::InputArgList &args) { } ctx.arg.thinLTOModulesToCompile = args::getStrings(args, OPT_thinlto_single_module_eq); - ctx.arg.timeTraceEnabled = args.hasArg(OPT_time_trace_eq); + ctx.arg.timeTraceEnabled = + args.hasArg(OPT_time_trace_eq) && !ctx.e.disableOutput; ctx.arg.timeTraceGranularity = args::getInteger(args, OPT_time_trace_granularity, 500); ctx.arg.trace = args.hasArg(OPT_trace); From c1f10ef0a5c15f1dccf87ff07699055297c715a5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 18:13:42 -0800 Subject: [PATCH 124/432] [ELF] SHF_LINK_ORDER: replace Fatal with ErrAlways In LLD_IN_TEST=2 mode, when a thread calls Fatal, there will be no output even if the process exits with code 1. Change the Fatal to ErrAlways (not-recoverable) as subsequent code assumes SHF_LINK_ORDER sh_link is correct. --- lld/ELF/InputFiles.cpp | 8 +++++--- lld/test/ELF/invalid/linkorder-invalid-sec.test | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index e236057a0d6d1..eba4c234d3f16 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -890,9 +890,11 @@ void ObjFile::initializeSections(bool ignoreComdats, InputSectionBase *linkSec = nullptr; if (sec.sh_link < size) linkSec = this->sections[sec.sh_link]; - if (!linkSec) - Fatal(ctx) << this - << ": invalid sh_link index: " << uint32_t(sec.sh_link); + if (!linkSec) { + ErrAlways(ctx) << this + << ": invalid sh_link index: " << uint32_t(sec.sh_link); + continue; + } // A SHF_LINK_ORDER section is discarded if its linked-to section is // discarded. diff --git a/lld/test/ELF/invalid/linkorder-invalid-sec.test b/lld/test/ELF/invalid/linkorder-invalid-sec.test index a2f4ee8f5bc2b..e0132956f0ba1 100644 --- a/lld/test/ELF/invalid/linkorder-invalid-sec.test +++ b/lld/test/ELF/invalid/linkorder-invalid-sec.test @@ -1,6 +1,6 @@ # REQUIRES: x86 # RUN: yaml2obj %s -o %t.o -# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld %t.o -o /dev/null --noinhibit-exec 2>&1 | FileCheck %s # CHECK: invalid sh_link index: 12345 --- !ELF From c1ec5beb4ab36c2c4d99ed6d735d217e74364771 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 25 Jan 2025 18:31:42 -0800 Subject: [PATCH 125/432] [clang-format] Fix a TableGen crash on comment after l_paren (#124380) Fixes #124248. 
--- clang/lib/Format/TokenAnnotator.cpp | 10 ++++------
 clang/unittests/Format/FormatTestTableGen.cpp | 3 +++
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index bc41d43d1438c..655766178fbb0 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1115,7 +1115,7 @@ class AnnotatingParser {
       }
       if (!CurrentToken || CurrentToken->isNot(tok::l_paren))
         return false;
-      skipToNextNonComment();
+      next();
       // FIXME: Hack using inheritance to child context
       Contexts.back().IsTableGenBangOpe = true;
       bool Result = parseParens();
@@ -1124,12 +1124,10 @@ class AnnotatingParser {
     }
     // SimpleValue 9: Cond operator
     if (Tok->is(TT_TableGenCondOperator)) {
-      Tok = CurrentToken;
-      skipToNextNonComment();
-      if (!Tok || Tok->isNot(tok::l_paren))
+      if (!CurrentToken || CurrentToken->isNot(tok::l_paren))
         return false;
-      bool Result = parseParens();
-      return Result;
+      next();
+      return parseParens();
     }
     // We have to check identifier at the last because the kind of bang/cond
     // operators are also identifier.
diff --git a/clang/unittests/Format/FormatTestTableGen.cpp b/clang/unittests/Format/FormatTestTableGen.cpp
index 7771f6a109a9a..92377c31f2e91 100644
--- a/clang/unittests/Format/FormatTestTableGen.cpp
+++ b/clang/unittests/Format/FormatTestTableGen.cpp
@@ -101,6 +101,9 @@ TEST_F(FormatTestTableGen, BangOperators) {
                " \"zerozero\",\n"
                " true: // default\n"
                " \"positivepositive\");\n"
+               " let CondOpe3WithCommentAfterLParen = !cond(\n"
+               " // comment\n"
+               " !eq(/* comment */ x, 0): \"zero\");\n"
                "}");
 }

From 9b6990ff2531942d534c9ef7db728af2437c3329 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sat, 25 Jan 2025 19:23:12 -0800
Subject: [PATCH 126/432] [Github][CI] Add Windows Premerge Job for Testing
 (#122661)

This patch adds a Windows premerge job for testing. We plan to enable
this by default soon once we have evaluated stability and have good
reason to believe the system is reliable.
--- .github/workflows/premerge.yaml | 64 ++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 30f4fc807f3a5..54d6e1bf092cf 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -16,7 +16,7 @@ jobs: if: github.repository_owner == 'llvm' runs-on: llvm-premerge-linux-runners concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }} + group: ${{ github.workflow }}-linux-${{ github.event.pull_request.number || github.sha }} cancel-in-progress: true steps: - name: Checkout LLVM @@ -70,3 +70,65 @@ jobs: export CXX=/opt/llvm/bin/clang++ ./.ci/monolithic-linux.sh "$(echo ${linux_projects} | tr ' ' ';')" "$(echo ${linux_check_targets})" "$(echo ${linux_runtimes} | tr ' ' ';')" "$(echo ${linux_runtime_check_targets})" + + premerge-checks-windows: + if: github.repository_owner == 'llvm' + runs-on: llvm-premerge-windows-runners + concurrency: + group: ${{ github.workflow }}-windows-${{ github.event.pull_request.number || github.sha }} + cancel-in-progress: true + defaults: + run: + shell: bash + steps: + - name: Checkout LLVM + uses: actions/checkout@v4 + with: + fetch-depth: 2 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@v1.2.14 + with: + variant: "sccache" + max-size: "2000M" + - name: Compute Projects + id: vars + run: | + modified_files=$(git diff --name-only HEAD~1...HEAD) + modified_dirs=$(echo "$modified_files" | cut -d'/' -f1 | sort | uniq) + + echo $modified_files + echo $modified_dirs + + . ./.ci/compute-projects.sh + + all_projects="bolt clang clang-tools-extra compiler-rt cross-project-tests flang libc libclc lld lldb llvm mlir openmp polly pstl" + modified_projects="$(keep-modified-projects ${all_projects})" + + windows_projects_to_test=$(exclude-windows $(compute-projects-to-test 1 ${modified_projects})) + windows_check_targets=$(check-targets ${windows_projects_to_test} | sort | uniq | tr -d '\r' | tr '\n' ' ') + windows_projects=$(add-dependencies ${windows_projects_to_test} | sort | uniq | tr -d '\r' | tr '\n' ';') + + if [[ "${windows_projects}" == "" ]]; then + echo "No projects to build" + fi + + echo "Building projects: ${windows_projects}" + echo "Running project checks targets: ${windows_check_targets}" + + echo "windows-projects=${windows_projects}" >> $GITHUB_OUTPUT + echo "windows-check-targets=${windows_check_targets}" >> $GITHUB_OUTPUT + - name: Build and Test + # Mark the job as a success even if the step fails so that people do + # not get notified while the new premerge pipeline is in an + # experimental state. + # TODO(boomanaiden154): Remove this once the pipeline is stable and we + # are ready for people to start recieving notifications. 
+ continue-on-error: true + if: ${{ steps.vars.outputs.windows-projects != '' }} + shell: cmd + run: | + set MAX_PARALLEL_COMPILE_JOBS=64 + set MAX_PARALLEL_LINK_JOBS=64 + call C:\\BuildTools\\Common7\\Tools\\VsDevCmd.bat -arch=amd64 -host_arch=amd64 + bash .ci/monolithic-windows.sh "${{ steps.vars.outputs.windows-projects }}" "${{ steps.vars.outputs.windows-check-targets }}" + From 6bb70a94da1b5c53143537f1d2e96602a74331ca Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Sat, 25 Jan 2025 19:48:46 -0800 Subject: [PATCH 127/432] workflows/release-binaries: Enable builds on Linux/AArch64 (#120786) --- .github/workflows/release-binaries-all.yml | 1 + .github/workflows/release-binaries.yml | 24 ++++++++++++++++------ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release-binaries-all.yml b/.github/workflows/release-binaries-all.yml index d5b2d33286101..d18b9b0b5c2ff 100644 --- a/.github/workflows/release-binaries-all.yml +++ b/.github/workflows/release-binaries-all.yml @@ -83,6 +83,7 @@ jobs: matrix: runs-on: - ubuntu-22.04 + - ubuntu-22.04-arm - macos-13 - macos-14 diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index f9a264e7cf48f..2ca4aea8a3b0e 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -18,6 +18,7 @@ on: type: choice options: - ubuntu-22.04 + - ubuntu-22.04-arm - macos-13 - macos-14 @@ -55,6 +56,7 @@ jobs: ref: ${{ steps.vars.outputs.ref }} upload: ${{ steps.vars.outputs.upload }} target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }} + ccache: ${{ steps.vars.outputs.ccache }} build-flang: ${{ steps.vars.outputs.build-flang }} enable-pgo: ${{ steps.vars.outputs.enable-pgo }} release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }} @@ -119,8 +121,16 @@ jobs: echo "release-binary-basename=$release_binary_basename" >> $GITHUB_OUTPUT echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT - # Detect necessary CMake flags target="$RUNNER_OS-$RUNNER_ARCH" + # The hendrikmuhs/ccache-action action does not support installing sccache + # on arm64 Linux. + if [ "$target" = "Linux-ARM64" ]; then + echo ccache=ccache >> $GITHUB_OUTPUT + else + echo ccache=sccache >> $GITHUB_OUTPUT + fi + + # Detect necessary CMake flags echo "enable-pgo=false" >> $GITHUB_OUTPUT target_cmake_flags="-DLLVM_RELEASE_ENABLE_PGO=OFF" # The macOS builds try to cross compile some libraries so we need to @@ -146,7 +156,7 @@ jobs: echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT echo "build-flang=$build_flang" >> $GITHUB_OUTPUT case "${{ inputs.runs-on }}" in - ubuntu-22.04) + ubuntu-22.04*) build_runs_on="depot-${{ inputs.runs-on }}-16" test_runs_on=$build_runs_on ;; @@ -221,12 +231,14 @@ jobs: with: # Default to 2G to workaround: https://github.com/hendrikmuhs/ccache-action/issues/174 max-size: 2G - key: sccache-${{ runner.os }}-${{ runner.arch }}-release - variant: sccache + key: ${{ needs.prepare.outputs.ccache }}-${{ runner.os }}-${{ runner.arch }}-release + variant: ${{ needs.prepare.outputs.ccache }} - name: Configure id: build shell: bash + env: + CCACHE_BIN: ${{ needs.prepare.outputs.ccache }} run: | # There were some issues on the ARM64 MacOS runners with trying to build x86 object, # so we need to set some extra cmake flags to disable this. 
@@ -235,8 +247,8 @@ jobs: -C clang/cmake/caches/Release.cmake \ -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \ -DBOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" \ - -DCMAKE_C_COMPILER_LAUNCHER=sccache \ - -DCMAKE_CXX_COMPILER_LAUNCHER=sccache + -DCMAKE_C_COMPILER_LAUNCHER=$CCACHE_BIN \ + -DCMAKE_CXX_COMPILER_LAUNCHER=$CCACHE_BIN - name: Build shell: bash run: | From 44b85743498a88cb9fd1281bdfac47c93fcf6fee Mon Sep 17 00:00:00 2001 From: Palmer Date: Sat, 25 Jan 2025 23:30:55 -0500 Subject: [PATCH 128/432] [AArch64] Fix movk parsing with an .equ operand (#124428) Prior to 5da801386c2b820a4596fc6d8da6b5f4a6da94b4, this code worked: .equ p4_low_b0, 0x0000 movk x1, p4_low_b0, lsl 16 (The code above is from the isa-l project - I discovered this issue while trying to compile it with clang 19 on MacOS on aarch64) That commit fixed a different bug, but accidentally broke the case where the second operand to movk is not a literal. In 442f066fc464e953b7783230e95ccf2a67ebfb38, a fix was applied to handle the case where the second operand is a value like "(Val) >> 16". However, that didn't appear to fix the test case in this commit. In this commit, we extend the change to handle the case where the second operand is a identifier defined by .equ. Fixes #124427 --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 4 +++- llvm/test/MC/AArch64/basic-a64-instructions.s | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index d3eda48f3276e..27b052825d213 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -5017,7 +5017,9 @@ bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode, return true; E = SMLoc::getFromPointer(getLoc().getPointer() - 1); Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext())); - return false; + + // Parse an optional shift/extend modifier. + return parseOptionalShiftExtend(getTok()); } case AsmToken::Integer: case AsmToken::Real: diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s index 0ae23d672e4a3..14ac11f581a55 100644 --- a/llvm/test/MC/AArch64/basic-a64-instructions.s +++ b/llvm/test/MC/AArch64/basic-a64-instructions.s @@ -3347,6 +3347,11 @@ _func: // CHECK: mov x2, #5299989643264 // encoding: [0x42,0x9a,0xc0,0xd2] // CHECK: movk xzr, #{{4321|0x10e1}}, lsl #48 // encoding: [0x3f,0x1c,0xe2,0xf2] + .equ equvalue, 0x0001 + movk x1, equvalue, lsl 16 +// CHECK: .set equvalue, 1 +// CHECK-NEXT: movk x1, #1, lsl #16 // encoding: [0x21,0x00,0xa0,0xf2] + movz x2, #:abs_g0:sym movk w3, #:abs_g0_nc:sym From 753028bc81c1a556eaaaf45ac77ca0cf4c7a3b4a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 25 Jan 2025 22:04:03 -0800 Subject: [PATCH 129/432] [Xtensa] Move XtensaUtils to MCTargetDesc PR #121118 attempted to introduce `checkRegister` used by XtensaDisassembler. Since `checkRegister` and other functions in XtensaUtils.cpp cannot link against XtensaCodeGen, move them to XtensaDesc, which can be used by XtensaDisassembler. 
Pull Request: https://github.com/llvm/llvm-project/pull/123969 --- llvm/lib/Target/Xtensa/CMakeLists.txt | 1 - .../MCTargetDesc/XtensaMCTargetDesc.cpp | 42 +++++++++++++ .../Xtensa/MCTargetDesc/XtensaMCTargetDesc.h | 10 ++++ llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp | 4 +- llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp | 4 +- llvm/lib/Target/Xtensa/XtensaUtils.cpp | 59 ------------------- llvm/lib/Target/Xtensa/XtensaUtils.h | 27 --------- 7 files changed, 56 insertions(+), 91 deletions(-) delete mode 100644 llvm/lib/Target/Xtensa/XtensaUtils.cpp delete mode 100644 llvm/lib/Target/Xtensa/XtensaUtils.h diff --git a/llvm/lib/Target/Xtensa/CMakeLists.txt b/llvm/lib/Target/Xtensa/CMakeLists.txt index 726efadc87c0b..4fc1ba6dfa650 100644 --- a/llvm/lib/Target/Xtensa/CMakeLists.txt +++ b/llvm/lib/Target/Xtensa/CMakeLists.txt @@ -24,7 +24,6 @@ add_llvm_target(XtensaCodeGen XtensaRegisterInfo.cpp XtensaSubtarget.cpp XtensaTargetMachine.cpp - XtensaUtils.cpp LINK_COMPONENTS AsmPrinter diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp index 2653c293dc0c4..fc23c2356825f 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.cpp @@ -32,6 +32,48 @@ using namespace llvm; +bool Xtensa::isValidAddrOffset(int Scale, int64_t OffsetVal) { + bool Valid = false; + + switch (Scale) { + case 1: + Valid = (OffsetVal >= 0 && OffsetVal <= 255); + break; + case 2: + Valid = (OffsetVal >= 0 && OffsetVal <= 510) && ((OffsetVal & 0x1) == 0); + break; + case 4: + Valid = (OffsetVal >= 0 && OffsetVal <= 1020) && ((OffsetVal & 0x3) == 0); + break; + default: + break; + } + return Valid; +} + +bool Xtensa::isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset) { + int Scale = 0; + + switch (Opcode) { + case Xtensa::L8UI: + case Xtensa::S8I: + Scale = 1; + break; + case Xtensa::L16SI: + case Xtensa::L16UI: + case Xtensa::S16I: + Scale = 2; + break; + case Xtensa::LEA_ADD: + return (Offset >= -128 && Offset <= 127); + default: + // assume that MI is 32-bit load/store operation + Scale = 4; + break; + } + return isValidAddrOffset(Scale, Offset); +} + static MCAsmInfo *createXtensaMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT, const MCTargetOptions &Options) { diff --git a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h index 0e075be0df07f..6be54867d84a7 100644 --- a/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h +++ b/llvm/lib/Target/Xtensa/MCTargetDesc/XtensaMCTargetDesc.h @@ -28,6 +28,7 @@ class MCObjectWriter; class MCRegisterInfo; class MCSubtargetInfo; class MCTargetOptions; +class MachineInstr; class StringRef; class Target; class raw_ostream; @@ -43,6 +44,15 @@ MCAsmBackend *createXtensaMCAsmBackend(const Target &T, const MCTargetOptions &Options); std::unique_ptr createXtensaObjectWriter(uint8_t OSABI, bool IsLittleEndian); + +namespace Xtensa { +// Check address offset for load/store instructions. +// The offset should be multiple of scale. +bool isValidAddrOffset(int Scale, int64_t OffsetVal); + +// Check address offset for load/store instructions. +bool isValidAddrOffsetForOpcode(unsigned Opcode, int64_t Offset); +} // namespace Xtensa } // end namespace llvm // Defines symbolic names for Xtensa registers. 
diff --git a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp index ef14095d18efb..06cccd4831bfc 100644 --- a/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp +++ b/llvm/lib/Target/Xtensa/XtensaISelDAGToDAG.cpp @@ -10,9 +10,9 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/XtensaMCTargetDesc.h" #include "Xtensa.h" #include "XtensaTargetMachine.h" -#include "XtensaUtils.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" @@ -75,7 +75,7 @@ class XtensaDAGToDAGISel : public SelectionDAGISel { ConstantSDNode *CN = dyn_cast(Addr.getOperand(1)); int64_t OffsetVal = CN->getSExtValue(); - Valid = isValidAddrOffset(Scale, OffsetVal); + Valid = Xtensa::isValidAddrOffset(Scale, OffsetVal); if (Valid) { // If the first operand is a FI, get the TargetFI Node. diff --git a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp index bced2d4ad0095..4a8bafc540df0 100644 --- a/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp +++ b/llvm/lib/Target/Xtensa/XtensaRegisterInfo.cpp @@ -11,9 +11,9 @@ //===----------------------------------------------------------------------===// #include "XtensaRegisterInfo.h" +#include "MCTargetDesc/XtensaMCTargetDesc.h" #include "XtensaInstrInfo.h" #include "XtensaSubtarget.h" -#include "XtensaUtils.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -99,7 +99,7 @@ bool XtensaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int64_t Offset = SPOffset + (int64_t)StackSize + MI.getOperand(FIOperandNum + 1).getImm(); - bool Valid = isValidAddrOffset(MI, Offset); + bool Valid = Xtensa::isValidAddrOffsetForOpcode(MI.getOpcode(), Offset); // If MI is not a debug value, make sure Offset fits in the 16-bit immediate // field. diff --git a/llvm/lib/Target/Xtensa/XtensaUtils.cpp b/llvm/lib/Target/Xtensa/XtensaUtils.cpp deleted file mode 100644 index 98e424f6ea440..0000000000000 --- a/llvm/lib/Target/Xtensa/XtensaUtils.cpp +++ /dev/null @@ -1,59 +0,0 @@ -//===--- XtensaUtils.cpp ---- Xtensa Utility Functions ----------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains miscellaneous utility functions. 
-// -//===----------------------------------------------------------------------===// - -#include "XtensaUtils.h" - -namespace llvm { - -bool isValidAddrOffset(int Scale, int64_t OffsetVal) { - bool Valid = false; - - switch (Scale) { - case 1: - Valid = (OffsetVal >= 0 && OffsetVal <= 255); - break; - case 2: - Valid = (OffsetVal >= 0 && OffsetVal <= 510) && ((OffsetVal & 0x1) == 0); - break; - case 4: - Valid = (OffsetVal >= 0 && OffsetVal <= 1020) && ((OffsetVal & 0x3) == 0); - break; - default: - break; - } - return Valid; -} - -bool isValidAddrOffset(MachineInstr &MI, int64_t Offset) { - int Scale = 0; - - switch (MI.getOpcode()) { - case Xtensa::L8UI: - case Xtensa::S8I: - Scale = 1; - break; - case Xtensa::L16SI: - case Xtensa::L16UI: - case Xtensa::S16I: - Scale = 2; - break; - case Xtensa::LEA_ADD: - return (Offset >= -128 && Offset <= 127); - default: - // assume that MI is 32-bit load/store operation - Scale = 4; - break; - } - return isValidAddrOffset(Scale, Offset); -} - -} // namespace llvm diff --git a/llvm/lib/Target/Xtensa/XtensaUtils.h b/llvm/lib/Target/Xtensa/XtensaUtils.h deleted file mode 100644 index 2b0ac37a6971a..0000000000000 --- a/llvm/lib/Target/Xtensa/XtensaUtils.h +++ /dev/null @@ -1,27 +0,0 @@ -//===--- XtensaUtils.h ---- Xtensa Utility Functions ------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains miscellaneous utility functions. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIB_TARGET_XTENSA_XTENSAUTILS_H -#define LLVM_LIB_TARGET_XTENSA_XTENSAUTILS_H - -#include "XtensaInstrInfo.h" -#include "llvm/CodeGen/MachineInstr.h" - -namespace llvm { -// Check address offset for load/store instructions. -// The offset should be multiple of scale. -bool isValidAddrOffset(int Scale, int64_t OffsetVal); - -// Check address offset for load/store instructions. -bool isValidAddrOffset(MachineInstr &MI, int64_t Offset); -} // namespace llvm -#endif // LLVM_LIB_TARGET_XTENSA_XTENSAUTILS_H From 37fdde6025c8ead27a7608643b63e0d4498211e2 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 25 Jan 2025 22:53:10 -0800 Subject: [PATCH 130/432] [CodeGen] Remove implict conversions from Register to unsigned from MachineOperand. 
NFC --- llvm/include/llvm/CodeGen/MachineOperand.h | 2 +- llvm/lib/CodeGen/MachineOperand.cpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index be1b4fb7d54fb..3ec46afa781ab 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -854,7 +854,7 @@ class MachineOperand { Op.IsEarlyClobber = isEarlyClobber; Op.TiedTo = 0; Op.IsDebug = isDebug; - Op.SmallContents.RegNo = Reg; + Op.SmallContents.RegNo = Reg.id(); Op.Contents.Reg.Prev = nullptr; Op.Contents.Reg.Next = nullptr; Op.setSubReg(SubReg); diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index d11ac614ace35..f498491164e14 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -71,13 +71,13 @@ void MachineOperand::setReg(Register Reg) { if (MachineFunction *MF = getMFIfAvailable(*this)) { MachineRegisterInfo &MRI = MF->getRegInfo(); MRI.removeRegOperandFromUseList(this); - SmallContents.RegNo = Reg; + SmallContents.RegNo = Reg.id(); MRI.addRegOperandToUseList(this); return; } // Otherwise, just change the register, no problem. :) - SmallContents.RegNo = Reg; + SmallContents.RegNo = Reg.id(); } void MachineOperand::substVirtReg(Register Reg, unsigned SubIdx, @@ -291,7 +291,7 @@ void MachineOperand::ChangeToRegister(Register Reg, bool isDef, bool isImp, assert(!(isDead && !isDef) && "Dead flag on non-def"); assert(!(isKill && isDef) && "Kill flag on def"); OpKind = MO_Register; - SmallContents.RegNo = Reg; + SmallContents.RegNo = Reg.id(); SubReg_TargetFlags = 0; IsDef = isDef; IsImp = isImp; @@ -390,7 +390,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) { switch (MO.getType()) { case MachineOperand::MO_Register: // Register operands don't have target flags. - return hash_combine(MO.getType(), (unsigned)MO.getReg(), MO.getSubReg(), MO.isDef()); + return hash_combine(MO.getType(), MO.getReg().id(), MO.getSubReg(), + MO.isDef()); case MachineOperand::MO_Immediate: return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm()); case MachineOperand::MO_CImmediate: From f46eb1430992ba1abe246dfd0b4ccf8229fe0ab7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 26 Jan 2025 00:15:32 -0800 Subject: [PATCH 131/432] [AMDGPU] Replace unsigned with Register in SIMachineScheduler. NFC Some of these may eventually need to VirtRegOrUnit. --- llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp | 51 +++++++++---------- llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 26 +++++----- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp index 77b4f25021c75..b3fa65512e4c4 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -284,10 +284,9 @@ void SIScheduleBlock::fastSchedule() { } // Returns if the register was set between first and last. 
-static bool isDefBetween(unsigned Reg, - SlotIndex First, SlotIndex Last, - const MachineRegisterInfo *MRI, - const LiveIntervals *LIS) { +static bool isDefBetween(Register Reg, SlotIndex First, SlotIndex Last, + const MachineRegisterInfo *MRI, + const LiveIntervals *LIS) { for (MachineRegisterInfo::def_instr_iterator UI = MRI->def_instr_begin(Reg), UE = MRI->def_instr_end(); UI != UE; ++UI) { @@ -581,11 +580,11 @@ void SIScheduleBlock::printDebug(bool full) { << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' ' << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n"; dbgs() << "LiveIns:\n"; - for (unsigned Reg : LiveInRegs) + for (Register Reg : LiveInRegs) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << "\nLiveOuts:\n"; - for (unsigned Reg : LiveOutRegs) + for (Register Reg : LiveOutRegs) dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; } @@ -1413,12 +1412,12 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // highest topological index. LiveOutRegsNumUsages.resize(Blocks.size()); for (SIScheduleBlock *Block : Blocks) { - for (unsigned Reg : Block->getInRegs()) { + for (Register Reg : Block->getInRegs()) { bool Found = false; int topoInd = -1; for (SIScheduleBlock* Pred: Block->getPreds()) { - std::set PredOutRegs = Pred->getOutRegs(); - std::set::iterator RegPos = PredOutRegs.find(Reg); + std::set PredOutRegs = Pred->getOutRegs(); + std::set::iterator RegPos = PredOutRegs.find(Reg); if (RegPos != PredOutRegs.end()) { Found = true; @@ -1453,18 +1452,18 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, } #endif - std::set InRegs = DAG->getInRegs(); + std::set InRegs = DAG->getInRegs(); addLiveRegs(InRegs); // Increase LiveOutRegsNumUsages for blocks // producing registers consumed in another // scheduling region. - for (unsigned Reg : DAG->getOutRegs()) { + for (Register Reg : DAG->getOutRegs()) { for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { // Do reverse traversal int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i]; SIScheduleBlock *Block = Blocks[ID]; - const std::set &OutRegs = Block->getOutRegs(); + const std::set &OutRegs = Block->getOutRegs(); if (OutRegs.find(Reg) == OutRegs.end()) continue; @@ -1477,11 +1476,11 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG, // Fill LiveRegsConsumers for regs that were already // defined before scheduling. 
for (SIScheduleBlock *Block : Blocks) { - for (unsigned Reg : Block->getInRegs()) { + for (Register Reg : Block->getInRegs()) { bool Found = false; for (SIScheduleBlock* Pred: Block->getPreds()) { - std::set PredOutRegs = Pred->getOutRegs(); - std::set::iterator RegPos = PredOutRegs.find(Reg); + std::set PredOutRegs = Pred->getOutRegs(); + std::set::iterator RegPos = PredOutRegs.find(Reg); if (RegPos != PredOutRegs.end()) { Found = true; @@ -1573,13 +1572,11 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { if (SregCurrentUsage > maxSregUsage) maxSregUsage = SregCurrentUsage; LLVM_DEBUG(dbgs() << "Picking New Blocks\n"; dbgs() << "Available: "; - for (SIScheduleBlock *Block - : ReadyBlocks) dbgs() - << Block->getID() << ' '; + for (SIScheduleBlock *Block : ReadyBlocks) + dbgs() << Block->getID() << ' '; dbgs() << "\nCurrent Live:\n"; - for (unsigned Reg - : LiveRegs) dbgs() - << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; + for (Register Reg : LiveRegs) + dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' '; dbgs() << '\n'; dbgs() << "Current VGPRs: " << VregCurrentUsage << '\n'; dbgs() << "Current SGPRs: " << SregCurrentUsage << '\n';); @@ -1634,7 +1631,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() { // Tracking of currently alive registers to determine VGPR Usage. -void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { +void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { for (Register Reg : Regs) { // For now only track virtual registers. if (!Reg.isVirtual()) @@ -1645,10 +1642,10 @@ void SIScheduleBlockScheduler::addLiveRegs(std::set &Regs) { } void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block, - std::set &Regs) { - for (unsigned Reg : Regs) { + std::set &Regs) { + for (Register Reg : Regs) { // For now only track virtual registers. - std::set::iterator Pos = LiveRegs.find(Reg); + std::set::iterator Pos = LiveRegs.find(Reg); assert (Pos != LiveRegs.end() && // Reg must be live. LiveRegsConsumers.find(Reg) != LiveRegsConsumers.end() && LiveRegsConsumers[Reg] >= 1); @@ -1687,8 +1684,8 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) { } std::vector -SIScheduleBlockScheduler::checkRegUsageImpact(std::set &InRegs, - std::set &OutRegs) { +SIScheduleBlockScheduler::checkRegUsageImpact(std::set &InRegs, + std::set &OutRegs) { std::vector DiffSetPressure; DiffSetPressure.assign(DAG->getTRI()->getNumRegPressureSets(), 0); diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index f8f4b5aae338e..b219cbd5672f0 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -81,8 +81,8 @@ class SIScheduleBlock { // Note that some registers are not 32 bits, // and thus the pressure is not equal // to the number of live registers. 
- std::set LiveInRegs; - std::set LiveOutRegs; + std::set LiveInRegs; + std::set LiveOutRegs; bool Scheduled = false; bool HighLatencyBlock = false; @@ -157,8 +157,8 @@ class SIScheduleBlock { return InternalAdditionalPressure; } - std::set &getInRegs() { return LiveInRegs; } - std::set &getOutRegs() { return LiveOutRegs; } + std::set &getInRegs() { return LiveInRegs; } + std::set &getOutRegs() { return LiveOutRegs; } void printDebug(bool Full); @@ -320,10 +320,10 @@ class SIScheduleBlockScheduler { SISchedulerBlockSchedulerVariant Variant; std::vector Blocks; - std::vector> LiveOutRegsNumUsages; - std::set LiveRegs; + std::vector> LiveOutRegsNumUsages; + std::set LiveRegs; // Num of schedulable unscheduled blocks reading the register. - std::map LiveRegsConsumers; + std::map LiveRegsConsumers; std::vector LastPosHighLatencyParentScheduled; int LastPosWaitedHighLatency; @@ -389,15 +389,15 @@ class SIScheduleBlockScheduler { SIBlockSchedCandidate &TryCand); SIScheduleBlock *pickBlock(); - void addLiveRegs(std::set &Regs); - void decreaseLiveRegs(SIScheduleBlock *Block, std::set &Regs); + void addLiveRegs(std::set &Regs); + void decreaseLiveRegs(SIScheduleBlock *Block, std::set &Regs); void releaseBlockSuccs(SIScheduleBlock *Parent); void blockScheduled(SIScheduleBlock *Block); // Check register pressure change // by scheduling a block with these LiveIn and LiveOut. - std::vector checkRegUsageImpact(std::set &InRegs, - std::set &OutRegs); + std::vector checkRegUsageImpact(std::set &InRegs, + std::set &OutRegs); void schedule(); }; @@ -462,8 +462,8 @@ class SIScheduleDAGMI final : public ScheduleDAGMILive { unsigned &VgprUsage, unsigned &SgprUsage); - std::set getInRegs() { - std::set InRegs; + std::set getInRegs() { + std::set InRegs; for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) { InRegs.insert(RegMaskPair.RegUnit); } From ab895ad2bfb6835e8c47d8e616edb6cadaf59b77 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 26 Jan 2025 01:34:41 -0800 Subject: [PATCH 132/432] [AST] Migrate away from PointerUnion::dyn_cast (NFC) (#124446) Note that PointerUnion::dyn_cast has been soft deprecated in PointerUnion.h: // FIXME: Replace the uses of is(), get() and dyn_cast() with // isa, cast and the llvm::dyn_cast Literal migration would result in dyn_cast_if_present (see the definition of PointerUnion::dyn_cast), but this patch uses dyn_cast because we expect Pattern to be nonnull. 
---
 clang/lib/AST/DeclTemplate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp
index e4cb7dcb16a45..de81bc64106f1 100644
--- a/clang/lib/AST/DeclTemplate.cpp
+++ b/clang/lib/AST/DeclTemplate.cpp
@@ -1463,7 +1463,7 @@ SourceRange VarTemplateSpecializationDecl::getSourceRange() const {
     assert(!Pattern.isNull() &&
            "Variable template specialization without pattern?");
     if (const auto *VTPSD =
-            Pattern.dyn_cast<VarTemplatePartialSpecializationDecl *>())
+            dyn_cast<VarTemplatePartialSpecializationDecl *>(Pattern))
       return VTPSD->getSourceRange();
     VarTemplateDecl *VTD = cast<VarTemplateDecl *>(Pattern);
     if (hasInit()) {

From f09a6f632584c2b34f8f2d048a5420b040bb1005 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 26 Jan 2025 01:34:59 -0800
Subject: [PATCH 133/432] [Sema] Migrate away from PointerUnion::dyn_cast (NFC)
 (#124447)

Note that PointerUnion::dyn_cast has been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

Literal migration would result in dyn_cast_if_present (see the
definition of PointerUnion::dyn_cast), but this patch uses dyn_cast
because we expect AnyFunc to be nonnull.
---
 clang/lib/Sema/SemaAPINotes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaAPINotes.cpp b/clang/lib/Sema/SemaAPINotes.cpp
index 4f79775bc5e91..b354bb7b06435 100644
--- a/clang/lib/Sema/SemaAPINotes.cpp
+++ b/clang/lib/Sema/SemaAPINotes.cpp
@@ -478,7 +478,7 @@ static void ProcessAPINotes(Sema &S, FunctionOrMethod AnyFunc,
                             const api_notes::FunctionInfo &Info,
                             VersionedInfoMetadata Metadata) {
   // Find the declaration itself.
-  FunctionDecl *FD = AnyFunc.dyn_cast<FunctionDecl *>();
+  FunctionDecl *FD = dyn_cast<FunctionDecl *>(AnyFunc);
   Decl *D = FD;
   ObjCMethodDecl *MD = nullptr;
   if (!D) {

From 850852e9a45f7883bd1a04c2a6b9fceb6dcdaba2 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sun, 26 Jan 2025 01:35:39 -0800
Subject: [PATCH 134/432] [CodeGen] Avoid repeated hash lookups (NFC) (#124455)

---
 llvm/lib/CodeGen/InlineSpiller.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index d98254650a001..33915d0f7f829 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -1578,7 +1578,8 @@ void HoistSpillHelper::runHoistSpills(
     for (auto *const SpillBB : SpillsInSubTree) {
       // When SpillBB is a BB contains original spill, insert the spill
       // to SpillsToRm.
-      if (SpillsToKeep.contains(SpillBB) && !SpillsToKeep[SpillBB]) {
+      if (auto It = SpillsToKeep.find(SpillBB);
+          It != SpillsToKeep.end() && !It->second) {
         MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
         SpillsToRm.push_back(SpillToRm);
       }

From 8035d38daab028b8da3cf2b01090b5f0ceacd695 Mon Sep 17 00:00:00 2001
From: Mats Petersson
Date: Sun, 26 Jan 2025 09:44:04 +0000
Subject: [PATCH 135/432] [Flang][OpenMP]Add parsing support for DISPATCH
 construct (#121982)

This allows the Flang parser to accept the !$OMP DISPATCH and related
clauses. Lowering is currently not implemented.
Tests for unparse and parse-tree dump are provided, and one for checking
that the lowering ends in a "not yet implemented" message.

---------

Co-authored-by: Kiran Chandramohan
---
 flang/include/flang/Parser/dump-parse-tree.h |  3 ++
 flang/include/flang/Parser/parse-tree.h      | 31 ++++++++++--
 flang/lib/Lower/OpenMP/OpenMP.cpp            | 10 ++++
 flang/lib/Parser/openmp-parsers.cpp          | 15 ++++++
 flang/lib/Parser/unparse.cpp                 |  9 ++++
 flang/lib/Semantics/check-omp-structure.cpp  | 30 ++++++++++++
 flang/lib/Semantics/check-omp-structure.h   |  2 +
 flang/lib/Semantics/resolve-directives.cpp   |  8 +++
 flang/test/Lower/OpenMP/Todo/dispatch.f90    | 12 +++++
 flang/test/Parser/OpenMP/dispatch.f90        | 51 ++++++++++++++++++++
 flang/test/Semantics/OpenMP/dispatch.f90     | 24 +++++++++
 11 files changed, 192 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/dispatch.f90
 create mode 100644 flang/test/Parser/OpenMP/dispatch.f90
 create mode 100644 flang/test/Semantics/OpenMP/dispatch.f90

diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h
index 11725991e9c9a..a501ae658a382 100644
--- a/flang/include/flang/Parser/dump-parse-tree.h
+++ b/flang/include/flang/Parser/dump-parse-tree.h
@@ -679,6 +679,9 @@ class ParseTreeDumper {
   NODE_ENUM(common, OmpAtomicDefaultMemOrderType)
   NODE(parser, OpenMPDepobjConstruct)
   NODE(parser, OpenMPUtilityConstruct)
+  NODE(parser, OpenMPDispatchConstruct)
+  NODE(parser, OmpDispatchDirective)
+  NODE(parser, OmpEndDispatchDirective)
   NODE(parser, OpenMPFlushConstruct)
   NODE(parser, OpenMPLoopConstruct)
   NODE(parser, OpenMPExecutableAllocate)
diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h
index 00d85aa05fb3a..78962db8a84de 100644
--- a/flang/include/flang/Parser/parse-tree.h
+++ b/flang/include/flang/Parser/parse-tree.h
@@ -4685,6 +4685,31 @@ struct OpenMPDepobjConstruct {
   std::tuple<Verbatim, OmpObject, OmpClause> t;
 };
 
+// Ref: [5.2: 200-201]
+//
+// dispatch-construct -> DISPATCH dispatch-clause
+// dispatch-clause -> depend-clause |
+//                    device-clause |
+//                    is_device_ptr-clause |
+//                    nocontext-clause |
+//                    novariants-clause |
+//                    nowait-clause
+struct OmpDispatchDirective {
+  TUPLE_CLASS_BOILERPLATE(OmpDispatchDirective);
+  CharBlock source;
+  std::tuple<Verbatim, OmpClauseList> t;
+};
+
+EMPTY_CLASS(OmpEndDispatchDirective);
+
+struct OpenMPDispatchConstruct {
+  TUPLE_CLASS_BOILERPLATE(OpenMPDispatchConstruct);
+  CharBlock source;
+  std::tuple<OmpDispatchDirective, Block,
+      std::optional<OmpEndDispatchDirective>>
+      t;
+};
+
 // 2.17.8 flush -> FLUSH [memory-order-clause] [(variable-name-list)]
 struct OpenMPFlushConstruct {
   TUPLE_CLASS_BOILERPLATE(OpenMPFlushConstruct);
@@ -4757,9 +4782,9 @@ struct OpenMPConstruct {
   UNION_CLASS_BOILERPLATE(OpenMPConstruct);
   std::variant<OpenMPStandaloneConstruct, OpenMPSectionsConstruct,
       OpenMPSectionConstruct, OpenMPLoopConstruct, OpenMPBlockConstruct,
-      OpenMPAtomicConstruct, OpenMPDeclarativeAllocate,
-      OpenMPUtilityConstruct, OpenMPExecutableAllocate,
-      OpenMPAllocatorsConstruct, OpenMPCriticalConstruct>
+      OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPDispatchConstruct,
+      OpenMPUtilityConstruct, OpenMPExecutableAllocate,
+      OpenMPAllocatorsConstruct, OpenMPCriticalConstruct>
       u;
 };
 
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 1434bcd6330e0..7c8d292e90f01 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -381,6 +381,9 @@ extractOmpDirective(const parser::OpenMPConstruct &ompConstruct) {
       [](const parser::OpenMPDeclarativeAllocate &c) {
         return llvm::omp::OMPD_allocate;
       },
+      [](const parser::OpenMPDispatchConstruct &c) {
+        return llvm::omp::OMPD_dispatch;
+      },
       [](const parser::OpenMPExecutableAllocate &c) {
         return llvm::omp::OMPD_allocate;
       },
@@ -3388,6 +3391,13 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
   TODO(converter.getCurrentLocation(), "OpenMPUtilityConstruct");
 }
 
+static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
+                   semantics::SemanticsContext &semaCtx,
+                   lower::pft::Evaluation &eval,
+                   const parser::OpenMPDispatchConstruct &) {
+  TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct");
+}
+
 static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 5ff91da082c85..aa2fec01bc640 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -740,11 +740,15 @@ TYPE_PARSER(
     "MERGEABLE" >> construct<OmpClause>(construct<OmpClause::Mergeable>()) ||
     "MESSAGE" >> construct<OmpClause>(construct<OmpClause::Message>(
                      parenthesized(Parser<OmpMessageClause>{}))) ||
+    "NOCONTEXT" >> construct<OmpClause>(construct<OmpClause::Nocontext>(
+                       parenthesized(scalarLogicalExpr))) ||
     "NOGROUP" >> construct<OmpClause>(construct<OmpClause::Nogroup>()) ||
     "NONTEMPORAL" >> construct<OmpClause>(construct<OmpClause::Nontemporal>(
                          parenthesized(nonemptyList(name)))) ||
     "NOTINBRANCH" >> construct<OmpClause>(construct<OmpClause::Notinbranch>()) ||
+    "NOVARIANTS" >> construct<OmpClause>(construct<OmpClause::Novariants>(
+                        parenthesized(scalarLogicalExpr))) ||
     "NOWAIT" >> construct<OmpClause>(construct<OmpClause::Nowait>()) ||
     "NUM_TASKS" >> construct<OmpClause>(construct<OmpClause::NumTasks>(
                        parenthesized(Parser<OmpNumTasksClause>{}))) ||
@@ -1119,6 +1123,16 @@ TYPE_PARSER(sourced(construct<OmpCriticalDirective>(verbatim("CRITICAL"_tok),
 TYPE_PARSER(construct<OpenMPCriticalConstruct>(
     Parser<OmpCriticalDirective>{}, block, Parser<OmpEndCriticalDirective>{}))
 
+TYPE_PARSER(sourced(construct<OmpDispatchDirective>(
+    verbatim("DISPATCH"_tok), Parser<OmpClauseList>{})))
+
+TYPE_PARSER(
+    construct<OmpEndDispatchDirective>(startOmpLine >> "END DISPATCH"_tok))
+
+TYPE_PARSER(sourced(construct<OpenMPDispatchConstruct>(
+    Parser<OmpDispatchDirective>{} / endOmpLine, block,
+    maybe(Parser<OmpEndDispatchDirective>{} / endOmpLine))))
+
 // 2.11.3 Executable Allocate directive
 TYPE_PARSER(
     sourced(construct<OpenMPExecutableAllocate>(verbatim("ALLOCATE"_tok),
@@ -1219,6 +1233,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US,
         construct<OpenMPConstruct>(Parser<OpenMPStandaloneConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPUtilityConstruct>{}),
+        construct<OpenMPConstruct>(Parser<OpenMPDispatchConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}),
         construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}),
         construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}),
diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp
index 7bf404bba2c3e..5b1ff07382c4d 100644
--- a/flang/lib/Parser/unparse.cpp
+++ b/flang/lib/Parser/unparse.cpp
@@ -2725,6 +2725,15 @@ class UnparseVisitor {
     Walk(x.v);
     return false;
   }
+  void Unparse(const OmpDispatchDirective &x) {
+    Word("!$OMP DISPATCH");
+    Walk(x.t);
+    Put("\n");
+  }
+  void Unparse(const OmpEndDispatchDirective &) {
+    Word("!$OMP END DISPATCH");
+    Put("\n");
+  }
   void Unparse(const OmpErrorDirective &x) {
     Word("!$OMP ERROR ");
     Walk(x.t);
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index d3f2d3fd2f9dc..c7ad9cc085a21 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -1748,6 +1748,36 @@ void OmpStructureChecker::Enter(const parser::OmpErrorDirective &x) {
   PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_error);
 }
 
+void OmpStructureChecker::Enter(const parser::OpenMPDispatchConstruct &x) {
+  PushContextAndClauseSets(x.source, llvm::omp::Directive::OMPD_dispatch);
+  const auto &block{std::get<parser::Block>(x.t)};
+  if (block.empty() || block.size() > 1) {
+    context_.Say(x.source,
+        "The DISPATCH construct is empty or contains more than one statement"_err_en_US);
+    return;
+  }
+
+  auto it{block.begin()};
+  bool passChecks{false};
+  if (const parser::AssignmentStmt *
+          assignStmt{parser::Unwrap<parser::AssignmentStmt>(*it)}) {
+    if (parser::Unwrap<parser::FunctionReference>(assignStmt->t)) {
+      passChecks = true;
+    }
+  } else if (parser::Unwrap<parser::CallStmt>(*it)) {
+    passChecks = true;
+  }
+
+  if (!passChecks) {
+    context_.Say(x.source,
+        "The DISPATCH construct does not contain a SUBROUTINE or FUNCTION"_err_en_US);
+  }
+}
+
+void OmpStructureChecker::Leave(const parser::OpenMPDispatchConstruct &x) {
+  dirContext_.pop_back();
+}
+
 void OmpStructureChecker::Leave(const parser::OmpErrorDirective &x) {
   dirContext_.pop_back();
 }
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index dc360957c873b..2b8304cb17037 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -105,6 +105,8 @@ class OmpStructureChecker
   void Enter(const parser::OmpDeclareTargetWithList &);
   void Enter(const parser::OmpDeclareTargetWithClause &);
   void Leave(const parser::OmpDeclareTargetWithClause &);
+  void Enter(const parser::OpenMPDispatchConstruct &);
+  void Leave(const parser::OpenMPDispatchConstruct &);
   void Enter(const parser::OmpErrorDirective &);
   void Leave(const parser::OmpErrorDirective &);
   void Enter(const parser::OpenMPExecutableAllocate &);
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index ea102371334a6..4e6d819f545a2 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -441,6 +441,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
   bool Pre(const parser::OpenMPDeclarativeAllocate &);
   void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); }
 
+  bool Pre(const parser::OpenMPDispatchConstruct &);
+  void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); }
+
   bool Pre(const parser::OpenMPExecutableAllocate &);
   void Post(const parser::OpenMPExecutableAllocate &);
 
@@ -1976,6 +1979,11 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) {
   return false;
 }
 
+bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) {
+  PushContext(x.source, llvm::omp::Directive::OMPD_dispatch);
+  return true;
+}
+
 bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) {
   PushContext(x.source, llvm::omp::Directive::OMPD_allocate);
   const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)};
diff --git a/flang/test/Lower/OpenMP/Todo/dispatch.f90 b/flang/test/Lower/OpenMP/Todo/dispatch.f90
new file mode 100644
index 0000000000000..380dfa14eaae1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/dispatch.f90
@@ -0,0 +1,12 @@
+! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s
+
+! CHECK: not yet implemented: OpenMPDispatchConstruct
+program p
+  integer r
+  r = 1
+!$omp dispatch nowait
+  call foo()
+contains
+  subroutine foo
+  end subroutine
+end program p
diff --git a/flang/test/Parser/OpenMP/dispatch.f90 b/flang/test/Parser/OpenMP/dispatch.f90
new file mode 100644
index 0000000000000..98cd6090334f3
--- /dev/null
+++ b/flang/test/Parser/OpenMP/dispatch.f90
@@ -0,0 +1,51 @@
+! RUN: %flang_fc1 -fopenmp -fdebug-dump-parse-tree %s | FileCheck %s
+! 
RUN: %flang_fc1 -fopenmp -fdebug-unparse %s | FileCheck %s --check-prefix="UNPARSE" + +integer function func(a, b, c) + integer :: a, b, c + func = a + b + c +end function func + +subroutine sub(x) + use iso_c_binding + integer :: func + integer :: r + type(c_ptr) :: x + integer :: a = 14, b = 7, c = 21 +!UNPARSE: !$OMP DISPATCH DEVICE(3_4) NOWAIT NOCONTEXT(.false._4) NOVARIANTS(.true._4) +!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct +!CHECK-NEXT: | | | OmpDispatchDirective +!CHECK: | | | | OmpClauseList -> OmpClause -> Device -> OmpDeviceClause +!CHECK-NEXT: | | | | | Scalar -> Integer -> Expr = '3_4' +!CHECK-NEXT: | | | | | | LiteralConstant -> IntLiteralConstant = '3' +!CHECK-NEXT: | | | | OmpClause -> Nowait +!CHECK-NEXT: | | | | OmpClause -> Nocontext -> Scalar -> Logical -> Expr = '.false._4' +!CHECK-NEXT: | | | | | LiteralConstant -> LogicalLiteralConstant +!CHECK-NEXT: | | | | | | bool = 'false' +!CHECK-NEXT: | | | | OmpClause -> Novariants -> Scalar -> Logical -> Expr = '.true._4' +!CHECK-NEXT: | | | | | EQ +!CHECK-NEXT: | | | | | | Expr = '1_4' +!CHECK-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-NEXT: | | | | | | Expr = '1_4' +!CHECK-NEXT: | | | | | | | LiteralConstant -> IntLiteralConstant = '1' +!CHECK-NEXT: | | | Block + + !$omp dispatch device(3) nowait nocontext(.false.) novariants(1.eq.1) + r = func(a, b, c) +!UNPARSE: !$OMP END DISPATCH +!CHECK: | | | OmpEndDispatchDirective + !$omp end dispatch + +!! Test the "no end dispatch" option. +!UNPARSE: !$OMP DISPATCH DEVICE(3_4) IS_DEVICE_PTR(x) +!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPDispatchConstruct +!CHECK-NEXT: | | | OmpDispatchDirective +!CHECK: | | | | OmpClause -> IsDevicePtr -> OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'x' + !$omp dispatch device(3) is_device_ptr(x) + r = func(a+1, b+2, c+3) +!CHECK-NOT: | | | OmpEndDispatchDirective + +end subroutine sub + + + diff --git a/flang/test/Semantics/OpenMP/dispatch.f90 b/flang/test/Semantics/OpenMP/dispatch.f90 new file mode 100644 index 0000000000000..7dfbeecb2fc1d --- /dev/null +++ b/flang/test/Semantics/OpenMP/dispatch.f90 @@ -0,0 +1,24 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenmp + +subroutine sb1 + integer :: r + r = 1 + !ERROR: The DISPATCH construct does not contain a SUBROUTINE or FUNCTION + !$omp dispatch nowait + print *,r +end subroutine +subroutine sb2 + integer :: r +!ERROR: The DISPATCH construct is empty or contains more than one statement + !$omp dispatch + call foo() + r = bar() + !$omp end dispatch +contains + subroutine foo + end subroutine foo + function bar + integer :: bar + bar = 2 + end function +end subroutine From 81d38da65e336dfb023df89f1bdc32633ad05fb2 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Sun, 26 Jan 2025 13:46:00 +0000 Subject: [PATCH 136/432] [LV] Add more tests for narrowing interleave groups for AArch64. Add additional tests for https://github.com/llvm/llvm-project/pull/106441. 
---
 ...sform-narrow-interleave-to-widen-memory.ll | 205 ++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
new file mode 100644
index 0000000000000..3fca274a3bb12
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory.ll
@@ -0,0 +1,205 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
+; RUN: opt -p loop-vectorize -S %s | FileCheck --check-prefixes=CHECK %s
+
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"
+target triple = "arm64-apple-macosx15.0.0"
+
+define void @test_complex_add_float(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
+; CHECK-LABEL: define void @test_complex_add_float(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <8 x float>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x float> [[WIDE_VEC2]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <8 x float>, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <8 x float> [[WIDE_VEC5]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <8 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x float> [[WIDE_VEC8]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[STRIDED_VEC]], [[STRIDED_VEC6]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <4 x float> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <8 x float> [[TMP13]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: store <8 x float> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV1]]
+; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV1]]
+; CHECK-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_2]], i64 4
+; CHECK-NEXT: [[L_A_1:%.*]] = load float, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[L_B_0:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd float [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_2]], i64 4
+; CHECK-NEXT: [[L_B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd float [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT: [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[RES]], i64 [[IV1]]
+; CHECK-NEXT: store float [[ADD_0]], ptr [[GEP_RES_0]], align 4
+; CHECK-NEXT: [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 4
+; CHECK-NEXT: store float [[ADD_1]], ptr [[GEP_RES_1]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { float, float }, ptr %A, i64 %iv
+  %gep.B.0 = getelementptr inbounds nuw { float, float }, ptr %B, i64 %iv
+  %l.A.0 = load float, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 4
+  %l.A.1 = load float, ptr %gep.A.1, align 4
+  %l.B.0 = load float, ptr %gep.B.0, align 4
+  %add.0 = fadd float %l.A.0, %l.B.0
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 4
+  %l.B.1 = load float, ptr %gep.B.1, align 4
+  %add.1 = fadd float %l.A.1, %l.B.1
+  %gep.res.0 = getelementptr inbounds nuw { float, float }, ptr %res, i64 %iv
+  store float %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 4
+  store float %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @test_complex_add_double(ptr %res, ptr noalias %A, ptr noalias %B, i64 %N) {
+; CHECK-LABEL: define void @test_complex_add_double(
+; CHECK-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[TMP1]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, ptr [[TMP2]], align 4
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, ptr [[TMP3]], align 4
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <4 x double>, ptr [[TMP4]], align 4
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load <4 x double>, ptr [[TMP5]], align 4
+; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC8]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC6]]
+; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[STRIDED_VEC3]], [[STRIDED_VEC9]]
+; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[STRIDED_VEC1]], [[STRIDED_VEC7]]
+; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[STRIDED_VEC4]], [[STRIDED_VEC10]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> [[TMP8]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC11]], ptr [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[A]], i64 [[IV]]
+; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[B]], i64 [[IV]]
+; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_A_0]], i64 8
+; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd double [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_B_0]], i64 8
+; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd double [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT: [[GEP_RES_0:%.*]] = getelementptr inbounds nuw { double, double }, ptr [[RES]], i64 [[IV]]
+; CHECK-NEXT: store double [[ADD_0]], ptr [[GEP_RES_0]], align 4
+; CHECK-NEXT: [[GEP_RES_1:%.*]] = getelementptr inbounds nuw i8, ptr [[GEP_RES_0]], i64 8
+; CHECK-NEXT: store double [[ADD_1]], ptr [[GEP_RES_1]], align 4
+; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %gep.A.0 = getelementptr inbounds nuw { double, double }, ptr %A, i64 %iv
+  %gep.B.0 = getelementptr inbounds nuw { double, double }, ptr %B, i64 %iv
+  %l.A.0 = load double, ptr %gep.A.0, align 4
+  %gep.A.1 = getelementptr inbounds nuw i8, ptr %gep.A.0, i64 8
+  %l.A.1 = load double, ptr %gep.A.1, align 4
+  %l.B.0 = load double, ptr %gep.B.0, align 4
+  %add.0 = fadd double %l.A.0, %l.B.0
+  %gep.B.1 = getelementptr inbounds nuw i8, ptr %gep.B.0, i64 8
+  %l.B.1 = load double, ptr %gep.B.1, align 4
+  %add.1 = fadd double %l.A.1, %l.B.1
+  %gep.res.0 = getelementptr inbounds nuw { double, double }, ptr %res, i64 %iv
+  store double %add.0, ptr %gep.res.0, align 4
+  %gep.res.1 = getelementptr inbounds nuw i8, ptr %gep.res.0, i64 8
+  store double %add.1, ptr %gep.res.1, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, %N
+  br i1 %ec, label %exit, label %loop
+
+exit:
+  ret void
+}

From dec47b76f406242dfb9d36da4d7adfb171c71104 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Sun, 26 Jan 2025 14:43:51 +0000
Subject: 
[PATCH 137/432] [CostModel][X86] Update baseline CTTZ/CTLZ costs for x86_64 (#124312) Followup to #123623 - now that the CMOV has been removed, the throughput has improved, reducing the benefit of vectorization on pre-x86-64-v3 CPUs --- .../lib/Target/X86/X86TargetTransformInfo.cpp | 10 +- .../Analysis/CostModel/X86/ctlz-codesize.ll | 8 +- .../CostModel/X86/ctlz-sizelatency.ll | 8 +- llvm/test/Analysis/CostModel/X86/ctlz.ll | 4 +- .../Analysis/CostModel/X86/cttz-codesize.ll | 4 +- .../CostModel/X86/cttz-sizelatency.ll | 8 +- llvm/test/Analysis/CostModel/X86/cttz.ll | 4 +- .../CostModel/X86/intrinsic-cost-kinds.ll | 6 +- .../test/Transforms/SLPVectorizer/X86/ctlz.ll | 172 +++++++++++------- .../test/Transforms/SLPVectorizer/X86/cttz.ll | 74 +++++++- 10 files changed, 200 insertions(+), 98 deletions(-) diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index d3c923a76d074..cdc2ce752743c 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -4329,9 +4329,15 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, { ISD::ABS, MVT::i64, { 1, 2, 3, 3 } }, // SUB+CMOV { ISD::BITREVERSE, MVT::i64, { 10, 12, 20, 22 } }, { ISD::BSWAP, MVT::i64, { 1, 2, 1, 2 } }, - { ISD::CTLZ, MVT::i64, { 2, 2, 4, 5 } }, // BSR+XOR or BSR+XOR+CMOV + { ISD::CTLZ, MVT::i64, { 1, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i32, { 1, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i16, { 2, 2, 3, 3 } }, // MOV+BSR+XOR + { ISD::CTLZ, MVT::i8, { 2, 2, 4, 3 } }, // MOV+BSR+XOR { ISD::CTLZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 2, 2 } }, // BSR+XOR - { ISD::CTTZ, MVT::i64, { 2, 2, 3, 4 } }, // TEST+BSF+CMOV/BRANCH + { ISD::CTTZ, MVT::i64, { 1, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i32, { 1, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i16, { 2, 2, 2, 2 } }, // MOV+BSF + { ISD::CTTZ, MVT::i8, { 2, 2, 2, 2 } }, // MOV+BSF { ISD::CTTZ_ZERO_UNDEF, MVT::i64,{ 1, 2, 1, 2 } }, // BSF { ISD::CTPOP, MVT::i64, { 10, 6, 19, 19 } }, { ISD::ROTL, MVT::i64, { 2, 3, 1, 3 } }, diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll index da0f71c63ef80..9f8e4edf7a0fc 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-codesize.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call 
i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll index 2425e7286265b..fc3516695852a 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz-sizelatency.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' @@ -69,7 +69,7 @@ define i32 @var_ctlz_i32u(i32 %a) { define i16 @var_ctlz_i16(i16 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i16' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i16 @llvm.ctlz.i16(i16 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i16 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i16' @@ -95,7 +95,7 @@ define i16 @var_ctlz_i16u(i16 %a) { define i8 @var_ctlz_i8(i8 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i8' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %ctlz = call i8 @llvm.ctlz.i8(i8 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/ctlz.ll b/llvm/test/Analysis/CostModel/X86/ctlz.ll index fa7982ce09e9c..d9d04de12467d 100644 --- a/llvm/test/Analysis/CostModel/X86/ctlz.ll +++ b/llvm/test/Analysis/CostModel/X86/ctlz.ll @@ -17,7 +17,7 @@ declare i8 @llvm.ctlz.i8(i8, i1) define i64 @var_ctlz_i64(i64 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i64' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i64 @llvm.ctlz.i64(i64 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call 
i64 @llvm.ctlz.i64(i64 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i64' @@ -43,7 +43,7 @@ define i64 @var_ctlz_i64u(i64 %a) { define i32 @var_ctlz_i32(i32 %a) { ; NOLZCNT-LABEL: 'var_ctlz_i32' -; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) +; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ctlz = call i32 @llvm.ctlz.i32(i32 %a, i1 false) ; NOLZCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctlz ; ; LZCNT-LABEL: 'var_ctlz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll index 07bf1dd7a2ff6..621c1b9320fc8 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-codesize.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -40,7 +40,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll index afe5cb8c55fe6..34d363ce00879 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz-sizelatency.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' @@ -70,7 +70,7 @@ define i32 @var_cttz_i32u(i32 %a) { define i16 @var_cttz_i16(i16 %a) { ; NOBMI-LABEL: 'var_cttz_i16' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i16 @llvm.cttz.i16(i16 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
ret i16 %cttz ; ; BMI-LABEL: 'var_cttz_i16' @@ -96,7 +96,7 @@ define i16 @var_cttz_i16u(i16 %a) { define i8 @var_cttz_i8(i8 %a) { ; NOBMI-LABEL: 'var_cttz_i8' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i8 @llvm.cttz.i8(i8 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i8 %cttz ; ; BMI-LABEL: 'var_cttz_i8' diff --git a/llvm/test/Analysis/CostModel/X86/cttz.ll b/llvm/test/Analysis/CostModel/X86/cttz.ll index fa0f10f886f63..3f5a731b27d9b 100644 --- a/llvm/test/Analysis/CostModel/X86/cttz.ll +++ b/llvm/test/Analysis/CostModel/X86/cttz.ll @@ -18,7 +18,7 @@ declare i8 @llvm.cttz.i8(i8, i1) define i64 @var_cttz_i64(i64 %a) { ; NOBMI-LABEL: 'var_cttz_i64' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i64 @llvm.cttz.i64(i64 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %cttz ; ; BMI-LABEL: 'var_cttz_i64' @@ -44,7 +44,7 @@ define i64 @var_cttz_i64u(i64 %a) { define i32 @var_cttz_i32(i32 %a) { ; NOBMI-LABEL: 'var_cttz_i32' -; NOBMI-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; NOBMI-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %cttz = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; NOBMI-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %cttz ; ; BMI-LABEL: 'var_cttz_i32' diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll index 062e5f157bae2..bcef47ee9e056 100644 --- a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll @@ -232,7 +232,7 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { define void @cttz(i32 %a, <16 x i32> %va) { ; THRU-LABEL: 'cttz' -; THRU-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; THRU-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -242,12 +242,12 @@ define void @cttz(i32 %a, <16 x i32> %va) { ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'cttz' -; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'cttz' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s = call i32 @llvm.cttz.i32(i32 %a, i1 false) ; SIZE_LATE-NEXT: Cost Model: 
Found an estimated cost of 104 for instruction: %v = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %va, i1 false) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll index 8a22e45fe1ca5..9bf2ade3176d6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/ctlz.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE,SSE4 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 ; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 @@ -136,32 +136,47 @@ define void @ctlz_4i64() #0 { } define void @ctlz_4i32() #0 { -; SSE2-LABEL: @ctlz_4i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; SSE2-NEXT: ret void +; SSE-LABEL: @ctlz_4i32( +; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; SSE-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ctlz_4i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: 
[[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 +; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; AVX1-NEXT: ret void ; -; SSE4-LABEL: @ctlz_4i32( -; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 -; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 -; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 -; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 -; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 4 -; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 -; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 -; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 -; SSE4-NEXT: ret void +; AVX2-LABEL: @ctlz_4i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX2-NEXT: ret void ; -; AVX-LABEL: @ctlz_4i32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; AVX-NEXT: ret void +; AVX512-LABEL: @ctlz_4i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -179,47 +194,71 @@ define void @ctlz_4i32() #0 { } define void @ctlz_8i32() #0 { -; SSE2-LABEL: @ctlz_8i32( -; SSE2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2 -; SSE2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 2 -; SSE2-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP3]], i1 false) -; SSE2-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE2-NEXT: ret void +; SSE-LABEL: @ctlz_8i32( +; SSE-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; SSE-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; SSE-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; SSE-NEXT: [[LD3:%.*]] = load i32, 
ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; SSE-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; SSE-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; SSE-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; SSE-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; SSE-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; SSE-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; SSE-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; SSE-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; SSE-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; SSE-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; SSE-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; SSE-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; SSE-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; SSE-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; SSE-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; SSE-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; SSE-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; SSE-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; SSE-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; SSE-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; SSE-NEXT: ret void +; +; AVX1-LABEL: @ctlz_8i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) +; AVX1-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) +; AVX1-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) +; AVX1-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) +; AVX1-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 +; AVX1-NEXT: store i32 [[CTLZ1]], ptr getelementptr 
inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; AVX1-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; AVX1-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; AVX1-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; AVX1-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; AVX1-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; AVX1-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: ret void ; -; SSE4-LABEL: @ctlz_8i32( -; SSE4-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 -; SSE4-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 -; SSE4-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 -; SSE4-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 -; SSE4-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 -; SSE4-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 -; SSE4-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 -; SSE4-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 -; SSE4-NEXT: [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false) -; SSE4-NEXT: [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false) -; SSE4-NEXT: [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false) -; SSE4-NEXT: [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false) -; SSE4-NEXT: [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false) -; SSE4-NEXT: [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false) -; SSE4-NEXT: [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false) -; SSE4-NEXT: [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false) -; SSE4-NEXT: store i32 [[CTLZ0]], ptr @dst32, align 2 -; SSE4-NEXT: store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 -; SSE4-NEXT: store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 -; SSE4-NEXT: store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 -; SSE4-NEXT: store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 -; SSE4-NEXT: store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 -; SSE4-NEXT: store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 -; SSE4-NEXT: store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 -; SSE4-NEXT: ret void +; AVX2-LABEL: @ctlz_8i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX2-NEXT: ret void ; -; AVX-LABEL: @ctlz_8i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <8 x i32> [[TMP2]], ptr 
@dst32, align 2 -; AVX-NEXT: ret void +; AVX512-LABEL: @ctlz_8i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 2 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 @@ -1063,3 +1102,6 @@ define void @ctlz_undef_32i8() #0 { } attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SSE2: {{.*}} +; SSE4: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll index 22f0c3f936509..896be6f2fe213 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cttz.ll @@ -142,11 +142,32 @@ define void @cttz_4i32() #0 { ; SSE-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 ; SSE-NEXT: ret void ; -; AVX-LABEL: @cttz_4i32( -; AVX-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 -; AVX-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) -; AVX-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 -; AVX-NEXT: ret void +; AVX1-LABEL: @cttz_4i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 4 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4 +; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 4 +; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4 +; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4 +; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @cttz_4i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX2-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @cttz_4i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4 +; AVX512-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <4 x i32> [[TMP2]], ptr @dst32, align 4 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 4 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4 @@ -191,11 +212,44 @@ define void @cttz_8i32() #0 { ; SSE-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 ; SSE-NEXT: ret void ; -; AVX-LABEL: @cttz_8i32( -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 -; AVX-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) -; AVX-NEXT: 
store <8 x i32> [[TMP2]], ptr @dst32, align 2 -; AVX-NEXT: ret void +; AVX1-LABEL: @cttz_8i32( +; AVX1-NEXT: [[LD0:%.*]] = load i32, ptr @src32, align 2 +; AVX1-NEXT: [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 +; AVX1-NEXT: [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2 +; AVX1-NEXT: [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2 +; AVX1-NEXT: [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2 +; AVX1-NEXT: [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2 +; AVX1-NEXT: [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2 +; AVX1-NEXT: [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2 +; AVX1-NEXT: [[CTTZ0:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD0]], i1 false) +; AVX1-NEXT: [[CTTZ1:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD1]], i1 false) +; AVX1-NEXT: [[CTTZ2:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD2]], i1 false) +; AVX1-NEXT: [[CTTZ3:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD3]], i1 false) +; AVX1-NEXT: [[CTTZ4:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD4]], i1 false) +; AVX1-NEXT: [[CTTZ5:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD5]], i1 false) +; AVX1-NEXT: [[CTTZ6:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD6]], i1 false) +; AVX1-NEXT: [[CTTZ7:%.*]] = call i32 @llvm.cttz.i32(i32 [[LD7]], i1 false) +; AVX1-NEXT: store i32 [[CTTZ0]], ptr @dst32, align 2 +; AVX1-NEXT: store i32 [[CTTZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2 +; AVX1-NEXT: store i32 [[CTTZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2 +; AVX1-NEXT: store i32 [[CTTZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2 +; AVX1-NEXT: store i32 [[CTTZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2 +; AVX1-NEXT: store i32 [[CTTZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2 +; AVX1-NEXT: store i32 [[CTTZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2 +; AVX1-NEXT: store i32 [[CTTZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2 +; AVX1-NEXT: ret void +; +; AVX2-LABEL: @cttz_8i32( +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX2-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX2-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX2-NEXT: ret void +; +; AVX512-LABEL: @cttz_8i32( +; AVX512-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2 +; AVX512-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> [[TMP1]], i1 false) +; AVX512-NEXT: store <8 x i32> [[TMP2]], ptr @dst32, align 2 +; AVX512-NEXT: ret void ; %ld0 = load i32, ptr @src32, align 2 %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2 From e4514293f99962b47d881d5b40722c6b56a1f425 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Sun, 26 Jan 2025 16:17:21 +0100 Subject: [PATCH 138/432] [Clang] Correctly determine constexprness of dependent lambdas. 
(#124468)

We skipped checking whether a lambda is constexpr when the parent
context was dependent, even if the lambda itself wasn't dependent (and
there is no other opportunity to establish its constexprness).

Fixes #114234
Fixes #97958
---
 clang/docs/ReleaseNotes.rst                  |  1 +
 clang/lib/Sema/SemaLambda.cpp                | 14 +++++------
 .../test/SemaCXX/cxx1z-constexpr-lambdas.cpp | 24 +++++++++++++++++++
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index e9fffddd507c6..b1238db758845 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -991,6 +991,7 @@ Bug Fixes to C++ Support
 - Fixed assertions or false compiler diagnostics in the case of C++ modules for
   lambda functions or inline friend functions defined inside templates (#GH122493).
 - Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423)
+- Correctly determine the implicit constexprness of lambdas in dependent contexts. (#GH97958) (#GH114234)

 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp
index 87b3ca53cefaf..ceb32ee15dfa3 100644
--- a/clang/lib/Sema/SemaLambda.cpp
+++ b/clang/lib/Sema/SemaLambda.cpp
@@ -2239,18 +2239,18 @@ ExprResult Sema::BuildLambdaExpr(SourceLocation StartLoc, SourceLocation EndLoc,

   Cleanup.mergeFrom(LambdaCleanup);

-  LambdaExpr *Lambda = LambdaExpr::Create(Context, Class, IntroducerRange,
-                                          CaptureDefault, CaptureDefaultLoc,
-                                          ExplicitParams, ExplicitResultType,
-                                          CaptureInits, EndLoc,
-                                          ContainsUnexpandedParameterPack);
+  LambdaExpr *Lambda =
+      LambdaExpr::Create(Context, Class, IntroducerRange, CaptureDefault,
+                         CaptureDefaultLoc, ExplicitParams, ExplicitResultType,
+                         CaptureInits, EndLoc, ContainsUnexpandedParameterPack);
+
   // If the lambda expression's call operator is not explicitly marked constexpr
-  // and we are not in a dependent context, analyze the call operator to infer
+  // and is not dependent, analyze the call operator to infer
   // its constexpr-ness, suppressing diagnostics while doing so.
   if (getLangOpts().CPlusPlus17 && !CallOperator->isInvalidDecl() &&
       !CallOperator->isConstexpr() &&
       !isa<CoroutineBodyStmt>(CallOperator->getBody()) &&
-      !Class->getDeclContext()->isDependentContext()) {
+      !Class->isDependentContext()) {
     CallOperator->setConstexprKind(
         CheckConstexprFunctionDefinition(CallOperator,
                                          CheckConstexprKind::CheckValid)
diff --git a/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp b/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
index 6a1f48bf7958f..0c20dd9dc58c6 100644
--- a/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
+++ b/clang/test/SemaCXX/cxx1z-constexpr-lambdas.cpp
@@ -349,3 +349,27 @@ static_assert(OtherCaptures(), "");
 } // namespace PR36054

 #endif // ndef CPP14_AND_EARLIER
+
+
+#if __cpp_constexpr >= 201907L
+namespace GH114234 {
+template <auto Arg>
+auto g() { return Arg; }
+
+template <typename>
+auto f() {
+  []<typename>() {
+    g<[] { return 123; }()>();
+  }.template operator()<int>();
+}
+
+void test() { f<int>(); }
+}
+
+namespace GH97958 {
+static_assert(
+    []() -> decltype([]{ return true; })
+    { return {}; }()());
+}
+
+#endif

From 0c784851c50b6b5b844e6a1f21bbe73efac332d4 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Sun, 26 Jan 2025 15:32:46 +0000
Subject: [PATCH 139/432] [MathExtras] Favor using the hexadecimal FP constants
 (#123180)

This just fixes a TODO now that we are using C++17.
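For context: hexadecimal floating-point literals are a C++17 feature (hence the old TODO), and they spell out the significand bits exactly, so the rewrite below swaps each constant's spelling without changing its value. A minimal standalone sketch of the equivalence, not part of the patch (the pi constant is copied from the table below; the assertion is purely illustrative):

```cpp
// Sketch only: a hex FP literal and its decimal spelling can denote
// exactly the same double. Compile with -std=c++17.
#include <cstdio>

int main() {
  constexpr double pi_hex = 0x1.921fb54442d18P+1;  // significand bits spelled out
  constexpr double pi_dec = 3.1415926535897932385; // rounds to the same double
  static_assert(pi_hex == pi_dec, "both spellings denote one value");
  std::printf("%.17g\n", pi_hex); // prints 3.1415926535897931
  return 0;
}
```

Keeping the old decimal spelling in a trailing comment, as the patch does, preserves readability while the literal itself is now exact by construction.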
--- llvm/include/llvm/Support/MathExtras.h | 61 +++++++++++++------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 574e9a6116603..5a6f51adc07f3 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -43,38 +43,37 @@ using common_sint = /// Mathematical constants. namespace numbers { // TODO: Track C++20 std::numbers. -// TODO: Favor using the hexadecimal FP constants (requires C++17). // clang-format off -constexpr double e = 2.7182818284590452354, // (0x1.5bf0a8b145769P+1) https://oeis.org/A001113 - egamma = .57721566490153286061, // (0x1.2788cfc6fb619P-1) https://oeis.org/A001620 - ln2 = .69314718055994530942, // (0x1.62e42fefa39efP-1) https://oeis.org/A002162 - ln10 = 2.3025850929940456840, // (0x1.26bb1bbb55516P+1) https://oeis.org/A002392 - log2e = 1.4426950408889634074, // (0x1.71547652b82feP+0) - log10e = .43429448190325182765, // (0x1.bcb7b1526e50eP-2) - pi = 3.1415926535897932385, // (0x1.921fb54442d18P+1) https://oeis.org/A000796 - inv_pi = .31830988618379067154, // (0x1.45f306dc9c883P-2) https://oeis.org/A049541 - sqrtpi = 1.7724538509055160273, // (0x1.c5bf891b4ef6bP+0) https://oeis.org/A002161 - inv_sqrtpi = .56418958354775628695, // (0x1.20dd750429b6dP-1) https://oeis.org/A087197 - sqrt2 = 1.4142135623730950488, // (0x1.6a09e667f3bcdP+0) https://oeis.org/A00219 - inv_sqrt2 = .70710678118654752440, // (0x1.6a09e667f3bcdP-1) - sqrt3 = 1.7320508075688772935, // (0x1.bb67ae8584caaP+0) https://oeis.org/A002194 - inv_sqrt3 = .57735026918962576451, // (0x1.279a74590331cP-1) - phi = 1.6180339887498948482; // (0x1.9e3779b97f4a8P+0) https://oeis.org/A001622 -constexpr float ef = 2.71828183F, // (0x1.5bf0a8P+1) https://oeis.org/A001113 - egammaf = .577215665F, // (0x1.2788d0P-1) https://oeis.org/A001620 - ln2f = .693147181F, // (0x1.62e430P-1) https://oeis.org/A002162 - ln10f = 2.30258509F, // (0x1.26bb1cP+1) https://oeis.org/A002392 - log2ef = 1.44269504F, // (0x1.715476P+0) - log10ef = .434294482F, // (0x1.bcb7b2P-2) - pif = 3.14159265F, // (0x1.921fb6P+1) https://oeis.org/A000796 - inv_pif = .318309886F, // (0x1.45f306P-2) https://oeis.org/A049541 - sqrtpif = 1.77245385F, // (0x1.c5bf8aP+0) https://oeis.org/A002161 - inv_sqrtpif = .564189584F, // (0x1.20dd76P-1) https://oeis.org/A087197 - sqrt2f = 1.41421356F, // (0x1.6a09e6P+0) https://oeis.org/A002193 - inv_sqrt2f = .707106781F, // (0x1.6a09e6P-1) - sqrt3f = 1.73205081F, // (0x1.bb67aeP+0) https://oeis.org/A002194 - inv_sqrt3f = .577350269F, // (0x1.279a74P-1) - phif = 1.61803399F; // (0x1.9e377aP+0) https://oeis.org/A001622 +constexpr double e = 0x1.5bf0a8b145769P+1, // (2.7182818284590452354) https://oeis.org/A001113 + egamma = 0x1.2788cfc6fb619P-1, // (.57721566490153286061) https://oeis.org/A001620 + ln2 = 0x1.62e42fefa39efP-1, // (.69314718055994530942) https://oeis.org/A002162 + ln10 = 0x1.26bb1bbb55516P+1, // (2.3025850929940456840) https://oeis.org/A002392 + log2e = 0x1.71547652b82feP+0, // (1.4426950408889634074) + log10e = 0x1.bcb7b1526e50eP-2, // (.43429448190325182765) + pi = 0x1.921fb54442d18P+1, // (3.1415926535897932385) https://oeis.org/A000796 + inv_pi = 0x1.45f306dc9c883P-2, // (.31830988618379067154) https://oeis.org/A049541 + sqrtpi = 0x1.c5bf891b4ef6bP+0, // (1.7724538509055160273) https://oeis.org/A002161 + inv_sqrtpi = 0x1.20dd750429b6dP-1, // (.56418958354775628695) https://oeis.org/A087197 + sqrt2 = 0x1.6a09e667f3bcdP+0, // (1.4142135623730950488) 
https://oeis.org/A00219
+                 inv_sqrt2   = 0x1.6a09e667f3bcdP-1, // (.70710678118654752440)
+                 sqrt3       = 0x1.bb67ae8584caaP+0, // (1.7320508075688772935) https://oeis.org/A002194
+                 inv_sqrt3   = 0x1.279a74590331cP-1, // (.57735026918962576451)
+                 phi         = 0x1.9e3779b97f4a8P+0; // (1.6180339887498948482) https://oeis.org/A001622
+constexpr float ef          = 0x1.5bf0a8P+1F, // (2.71828183) https://oeis.org/A001113
+                egammaf     = 0x1.2788d0P-1F, // (.577215665) https://oeis.org/A001620
+                ln2f        = 0x1.62e430P-1F, // (.693147181) https://oeis.org/A002162
+                ln10f       = 0x1.26bb1cP+1F, // (2.30258509) https://oeis.org/A002392
+                log2ef      = 0x1.715476P+0F, // (1.44269504)
+                log10ef     = 0x1.bcb7b2P-2F, // (.434294482)
+                pif         = 0x1.921fb6P+1F, // (3.14159265) https://oeis.org/A000796
+                inv_pif     = 0x1.45f306P-2F, // (.318309886) https://oeis.org/A049541
+                sqrtpif     = 0x1.c5bf8aP+0F, // (1.77245385) https://oeis.org/A002161
+                inv_sqrtpif = 0x1.20dd76P-1F, // (.564189584) https://oeis.org/A087197
+                sqrt2f      = 0x1.6a09e6P+0F, // (1.41421356) https://oeis.org/A002193
+                inv_sqrt2f  = 0x1.6a09e6P-1F, // (.707106781)
+                sqrt3f      = 0x1.bb67aeP+0F, // (1.73205081) https://oeis.org/A002194
+                inv_sqrt3f  = 0x1.279a74P-1F, // (.577350269)
+                phif        = 0x1.9e377aP+0F; // (1.61803399) https://oeis.org/A001622
 // clang-format on
 } // namespace numbers

From 33ad474c45e6d7a0de7bc75e15e27cf6cb9ff895 Mon Sep 17 00:00:00 2001
From: Manuel Sainz de Baranda y Goñi
Date: Sun, 26 Jan 2025 16:48:42 +0100
Subject: [PATCH 140/432] [Clang] Add predefined macros for integer constants
 (#123514)

This adds predefined macros for integer constants to implement section
7.18.4 of ISO/IEC 9899:1999 in `<stdint.h>` in a safe way:

```
__INT8_C(c)
__INT16_C(c)
__INT32_C(c)
__INT64_C(c)
__INTMAX_C(c)
__UINT8_C(c)
__UINT16_C(c)
__UINT32_C(c)
__UINT64_C(c)
__UINTMAX_C(c)
```

This improves compatibility with GCC and makes it trivial to implement
section 7.18.4 of ISO/IEC 9899:1999. Clang defines `__INT<N>_C_SUFFIX__`,
`__UINT<N>_C_SUFFIX__`, `__INTMAX_C_SUFFIX__` and `__UINTMAX_C_SUFFIX__`,
but these macros are useless for this purpose. Let's say, for example,
that `__INT64_C_SUFFIX__` expands to `L` or `LL`.
If the user defines them as macros, the compiler will produce errors if
`INT64_C` is implemented in `<stdint.h>` using `__INT64_C_SUFFIX__`:

**minimal-test.c:**
```cpp
#if defined(__clang__) && !defined(__INT64_C)
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wreserved-identifier"
#	define __PSTDC_INT_C_(literal, suffix) literal##suffix
#	define __PSTDC_INT_C(literal, suffix) __PSTDC_INT_C_(literal, suffix)
#	define INT64_C(literal) __PSTDC_INT_C(literal, __INT64_C_SUFFIX__)
#	pragma clang diagnostic pop
#elif defined(__GNUC__)
#	define INT64_C __INT64_C
#endif

typedef __INT64_TYPE__ int64_t;

#define L  "Make Clang produce an error"
#define LL "Make Clang produce an error"

int main(int argc, char **argv) {
  (void)argc; (void)argv;
  int64_t v = INT64_C(9223372036854775807);
  (void)v;
  return 0;
}
```

**test.c:**
```cpp
#if defined(__clang__) && !defined(__INT8_C)
#	pragma clang diagnostic push
#	pragma clang diagnostic ignored "-Wreserved-identifier"
#	define __PSTDC_INT_C_(literal, suffix) literal##suffix
#	define __PSTDC_INT_C(literal, suffix) __PSTDC_INT_C_(literal, suffix)
#	define INT8_C(literal)    __PSTDC_INT_C(literal, __INT8_C_SUFFIX__)
#	define INT16_C(literal)   __PSTDC_INT_C(literal, __INT16_C_SUFFIX__)
#	define INT32_C(literal)   __PSTDC_INT_C(literal, __INT32_C_SUFFIX__)
#	define INT64_C(literal)   __PSTDC_INT_C(literal, __INT64_C_SUFFIX__)
#	define INTMAX_C(literal)  __PSTDC_INT_C(literal, __INTMAX_C_SUFFIX__)
#	define UINT8_C(literal)   __PSTDC_INT_C(literal, __UINT8_C_SUFFIX__)
#	define UINT16_C(literal)  __PSTDC_INT_C(literal, __UINT16_C_SUFFIX__)
#	define UINT32_C(literal)  __PSTDC_INT_C(literal, __UINT32_C_SUFFIX__)
#	define UINT64_C(literal)  __PSTDC_INT_C(literal, __UINT64_C_SUFFIX__)
#	define UINTMAX_C(literal) __PSTDC_INT_C(literal, __UINTMAX_C_SUFFIX__)
#	pragma clang diagnostic pop
#else
#	define INT8_C    __INT8_C
#	define INT16_C   __INT16_C
#	define INT32_C   __INT32_C
#	define INT64_C   __INT64_C
#	define INTMAX_C  __INTMAX_C
#	define UINT8_C   __UINT8_C
#	define UINT16_C  __UINT16_C
#	define UINT32_C  __UINT32_C
#	define UINT64_C  __UINT64_C
#	define UINTMAX_C __UINTMAX_C
#endif

typedef __INT8_TYPE__    int8_t;
typedef __INT16_TYPE__   int16_t;
typedef __INT32_TYPE__   int32_t;
typedef __INT64_TYPE__   int64_t;
typedef __INTMAX_TYPE__  intmax_t;
typedef __UINT8_TYPE__   uint8_t;
typedef __UINT16_TYPE__  uint16_t;
typedef __UINT32_TYPE__  uint32_t;
typedef __UINT64_TYPE__  uint64_t;
typedef __UINTMAX_TYPE__ uintmax_t;

#define L   "Make Clang produce an error"
#define LL  "Make Clang produce an error"
#define U   "Make Clang produce an error"
#define UL  "Make Clang produce an error"
#define ULL "Make Clang produce an error"

int main(int argc, char **argv) {
  (void)argc; (void)argv;

  int8_t    a = INT8_C   (127);
  int16_t   b = INT16_C  (32767);
  int32_t   c = INT32_C  (2147483647);
  int64_t   d = INT64_C  (9223372036854775807);
  intmax_t  e = INTMAX_C (9223372036854775807);
  uint8_t   f = UINT8_C  (255);
  uint16_t  g = UINT16_C (65535);
  uint32_t  h = UINT32_C (4294967295);
  uint64_t  i = UINT64_C (18446744073709551615);
  uintmax_t j = UINTMAX_C(18446744073709551615);

  (void)a; (void)b; (void)c; (void)d; (void)e;
  (void)f; (void)g; (void)h; (void)i; (void)j;
  return 0;
}
```
---
 clang/docs/ReleaseNotes.rst               |  11 +++
 clang/lib/Frontend/InitPreprocessor.cpp   |  14 ++-
 clang/test/Preprocessor/init-aarch64.c    |  35 ++++++++
 clang/test/Preprocessor/init-arm.c        |  71 +++++++++++++++
 clang/test/Preprocessor/init-csky.c       |  10 +++
 clang/test/Preprocessor/init-loongarch.c  |  20 +++++
 clang/test/Preprocessor/init-mips.c       |  60 +++++++++++++
 clang/test/Preprocessor/init-ppc.c        |  40 ++++++++
 clang/test/Preprocessor/init-ppc64.c      |  40 ++++++++
 clang/test/Preprocessor/init-s390x.c      |  10 +++
 clang/test/Preprocessor/init-v7k-compat.c |  10 +++
 clang/test/Preprocessor/init-ve.c         |  10 +++
 clang/test/Preprocessor/init-x86.c        |  70 ++++++++++++++
 clang/test/Preprocessor/init.c            | 101 ++++++++++++++++++++++
 14 files changed, 498 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b1238db758845..b63bd366cfe88 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -479,6 +479,17 @@ Non-comprehensive list of changes in this release
   ``__builtin_elementwise_sub_sat``, ``__builtin_reduce_min`` (For integral element type),
   ``__builtin_reduce_max`` (For integral element type).

+- The builtin macros ``__INT8_C``, ``__INT16_C``, ``__INT32_C``, ``__INT64_C``,
+  ``__INTMAX_C``, ``__UINT8_C``, ``__UINT16_C``, ``__UINT32_C``, ``__UINT64_C``
+  and ``__UINTMAX_C`` have been introduced to ease the implementation of section
+  7.18.4 of ISO/IEC 9899:1999. These macros are also defined by GCC and should
+  be used instead of others that expand and paste the suffixes provided by
+  ``__INT8_C_SUFFIX__``, ``__INT16_C_SUFFIX__``, ``__INT32_C_SUFFIX__``,
+  ``__INT64_C_SUFFIX__``, ``__INTMAX_C_SUFFIX__``, ``__UINT8_C_SUFFIX__``,
+  ``__UINT16_C_SUFFIX__``, ``__UINT32_C_SUFFIX__``, ``__UINT64_C_SUFFIX__`` and
+  ``__UINTMAX_C_SUFFIX__``. Pasting suffixes after the expansion of their
+  respective macros is unsafe, as users can define the suffixes as macros.
+
 - Clang now rejects ``_BitInt`` matrix element types if the bit width is less than ``CHAR_WIDTH`` or
   not a power of two, matching preexisting behaviour for vector types.

diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index 29723b573e771..17f624e964539 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -253,6 +253,8 @@ static void DefineExactWidthIntType(const LangOptions &LangOpts,
   StringRef ConstSuffix(TI.getTypeConstantSuffix(Ty));
   Builder.defineMacro(Prefix + Twine(TypeWidth) + "_C_SUFFIX__", ConstSuffix);
+  Builder.defineMacro(Prefix + Twine(TypeWidth) + "_C(c)",
+                      ConstSuffix.size() ? Twine("c##") + ConstSuffix : "c");
 }

 static void DefineExactWidthIntTypeSize(TargetInfo::IntType Ty,
@@ -1164,12 +1166,16 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   DefineType("__INTMAX_TYPE__", TI.getIntMaxType(), Builder);
   DefineFmt(LangOpts, "__INTMAX", TI.getIntMaxType(), TI, Builder);
-  Builder.defineMacro("__INTMAX_C_SUFFIX__",
-                      TI.getTypeConstantSuffix(TI.getIntMaxType()));
+  StringRef ConstSuffix(TI.getTypeConstantSuffix(TI.getIntMaxType()));
+  Builder.defineMacro("__INTMAX_C_SUFFIX__", ConstSuffix);
+  Builder.defineMacro("__INTMAX_C(c)",
+                      ConstSuffix.size() ? Twine("c##") + ConstSuffix : "c");
   DefineType("__UINTMAX_TYPE__", TI.getUIntMaxType(), Builder);
   DefineFmt(LangOpts, "__UINTMAX", TI.getUIntMaxType(), TI, Builder);
-  Builder.defineMacro("__UINTMAX_C_SUFFIX__",
-                      TI.getTypeConstantSuffix(TI.getUIntMaxType()));
+  ConstSuffix = TI.getTypeConstantSuffix(TI.getUIntMaxType());
+  Builder.defineMacro("__UINTMAX_C_SUFFIX__", ConstSuffix);
+  Builder.defineMacro("__UINTMAX_C(c)",
+                      ConstSuffix.size() ?
Twine("c##") + ConstSuffix : "c"); DefineType("__PTRDIFF_TYPE__", TI.getPtrDiffType(LangAS::Default), Builder); DefineFmt(LangOpts, "__PTRDIFF", TI.getPtrDiffType(LangAS::Default), TI, Builder); diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 8578993dbfaeb..b5e77ba10c347 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -135,26 +135,31 @@ // AARCH64_CXX-NEXT: #define __GLIBCXX_BITSIZE_INT_N_0 128 // AARCH64_CXX-NEXT: #define __GLIBCXX_TYPE_INT_N_0 __int128 // AARCH64-NEXT: #define __HAVE_FUNCTION_MULTI_VERSIONING 1 +// AARCH64-NEXT: #define __INT16_C(c) c // AARCH64-NEXT: #define __INT16_C_SUFFIX__ // AARCH64-NEXT: #define __INT16_FMTd__ "hd" // AARCH64-NEXT: #define __INT16_FMTi__ "hi" // AARCH64-NEXT: #define __INT16_MAX__ 32767 // AARCH64-NEXT: #define __INT16_TYPE__ short +// AARCH64-NEXT: #define __INT32_C(c) c // AARCH64-NEXT: #define __INT32_C_SUFFIX__ // AARCH64-NEXT: #define __INT32_FMTd__ "d" // AARCH64-NEXT: #define __INT32_FMTi__ "i" // AARCH64-NEXT: #define __INT32_MAX__ 2147483647 // AARCH64-NEXT: #define __INT32_TYPE__ int +// AARCH64-NEXT: #define __INT64_C(c) c##L // AARCH64-NEXT: #define __INT64_C_SUFFIX__ L // AARCH64-NEXT: #define __INT64_FMTd__ "ld" // AARCH64-NEXT: #define __INT64_FMTi__ "li" // AARCH64-NEXT: #define __INT64_MAX__ 9223372036854775807L // AARCH64-NEXT: #define __INT64_TYPE__ long int +// AARCH64-NEXT: #define __INT8_C(c) c // AARCH64-NEXT: #define __INT8_C_SUFFIX__ // AARCH64-NEXT: #define __INT8_FMTd__ "hhd" // AARCH64-NEXT: #define __INT8_FMTi__ "hhi" // AARCH64-NEXT: #define __INT8_MAX__ 127 // AARCH64-NEXT: #define __INT8_TYPE__ signed char +// AARCH64-NEXT: #define __INTMAX_C(c) c##L // AARCH64-NEXT: #define __INTMAX_C_SUFFIX__ L // AARCH64-NEXT: #define __INTMAX_FMTd__ "ld" // AARCH64-NEXT: #define __INTMAX_FMTi__ "li" @@ -287,6 +292,7 @@ // AARCH64-NEXT: #define __STDC_UTF_32__ 1 // AARCH64_C: #define __STDC_VERSION__ 201710L // AARCH64-NEXT: #define __STDC__ 1 +// AARCH64-NEXT: #define __UINT16_C(c) c // AARCH64-NEXT: #define __UINT16_C_SUFFIX__ // AARCH64-NEXT: #define __UINT16_FMTX__ "hX" // AARCH64-NEXT: #define __UINT16_FMTo__ "ho" @@ -294,6 +300,7 @@ // AARCH64-NEXT: #define __UINT16_FMTx__ "hx" // AARCH64-NEXT: #define __UINT16_MAX__ 65535 // AARCH64-NEXT: #define __UINT16_TYPE__ unsigned short +// AARCH64-NEXT: #define __UINT32_C(c) c##U // AARCH64-NEXT: #define __UINT32_C_SUFFIX__ U // AARCH64-NEXT: #define __UINT32_FMTX__ "X" // AARCH64-NEXT: #define __UINT32_FMTo__ "o" @@ -301,6 +308,7 @@ // AARCH64-NEXT: #define __UINT32_FMTx__ "x" // AARCH64-NEXT: #define __UINT32_MAX__ 4294967295U // AARCH64-NEXT: #define __UINT32_TYPE__ unsigned int +// AARCH64-NEXT: #define __UINT64_C(c) c##UL // AARCH64-NEXT: #define __UINT64_C_SUFFIX__ UL // AARCH64-NEXT: #define __UINT64_FMTX__ "lX" // AARCH64-NEXT: #define __UINT64_FMTo__ "lo" @@ -308,6 +316,7 @@ // AARCH64-NEXT: #define __UINT64_FMTx__ "lx" // AARCH64-NEXT: #define __UINT64_MAX__ 18446744073709551615UL // AARCH64-NEXT: #define __UINT64_TYPE__ long unsigned int +// AARCH64-NEXT: #define __UINT8_C(c) c // AARCH64-NEXT: #define __UINT8_C_SUFFIX__ // AARCH64-NEXT: #define __UINT8_FMTX__ "hhX" // AARCH64-NEXT: #define __UINT8_FMTo__ "hho" @@ -315,6 +324,7 @@ // AARCH64-NEXT: #define __UINT8_FMTx__ "hhx" // AARCH64-NEXT: #define __UINT8_MAX__ 255 // AARCH64-NEXT: #define __UINT8_TYPE__ unsigned char +// AARCH64-NEXT: #define __UINTMAX_C(c) c##UL // AARCH64-NEXT: #define __UINTMAX_C_SUFFIX__ UL // 
AARCH64-NEXT: #define __UINTMAX_FMTX__ "lX" // AARCH64-NEXT: #define __UINTMAX_FMTo__ "lo" @@ -435,26 +445,31 @@ // AARCH64-DARWIN: #define __FLT_MIN__ 1.17549435e-38F // AARCH64-DARWIN: #define __FLT_RADIX__ 2 // AARCH64-DARWIN: #define __FUNCTION_MULTI_VERSIONING_SUPPORT_LEVEL 202430 +// AARCH64-DARWIN: #define __INT16_C(c) c // AARCH64-DARWIN: #define __INT16_C_SUFFIX__ // AARCH64-DARWIN: #define __INT16_FMTd__ "hd" // AARCH64-DARWIN: #define __INT16_FMTi__ "hi" // AARCH64-DARWIN: #define __INT16_MAX__ 32767 // AARCH64-DARWIN: #define __INT16_TYPE__ short +// AARCH64-DARWIN: #define __INT32_C(c) c // AARCH64-DARWIN: #define __INT32_C_SUFFIX__ // AARCH64-DARWIN: #define __INT32_FMTd__ "d" // AARCH64-DARWIN: #define __INT32_FMTi__ "i" // AARCH64-DARWIN: #define __INT32_MAX__ 2147483647 // AARCH64-DARWIN: #define __INT32_TYPE__ int +// AARCH64-DARWIN: #define __INT64_C(c) c##LL // AARCH64-DARWIN: #define __INT64_C_SUFFIX__ LL // AARCH64-DARWIN: #define __INT64_FMTd__ "lld" // AARCH64-DARWIN: #define __INT64_FMTi__ "lli" // AARCH64-DARWIN: #define __INT64_MAX__ 9223372036854775807LL // AARCH64-DARWIN: #define __INT64_TYPE__ long long int +// AARCH64-DARWIN: #define __INT8_C(c) c // AARCH64-DARWIN: #define __INT8_C_SUFFIX__ // AARCH64-DARWIN: #define __INT8_FMTd__ "hhd" // AARCH64-DARWIN: #define __INT8_FMTi__ "hhi" // AARCH64-DARWIN: #define __INT8_MAX__ 127 // AARCH64-DARWIN: #define __INT8_TYPE__ signed char +// AARCH64-DARWIN: #define __INTMAX_C(c) c##L // AARCH64-DARWIN: #define __INTMAX_C_SUFFIX__ L // AARCH64-DARWIN: #define __INTMAX_FMTd__ "ld" // AARCH64-DARWIN: #define __INTMAX_FMTi__ "li" @@ -538,18 +553,23 @@ // AARCH64-DARWIN: #define __SIZE_MAX__ 18446744073709551615UL // AARCH64-DARWIN: #define __SIZE_TYPE__ long unsigned int // AARCH64-DARWIN: #define __SIZE_WIDTH__ 64 +// AARCH64-DARWIN: #define __UINT16_C(c) c // AARCH64-DARWIN: #define __UINT16_C_SUFFIX__ // AARCH64-DARWIN: #define __UINT16_MAX__ 65535 // AARCH64-DARWIN: #define __UINT16_TYPE__ unsigned short +// AARCH64-DARWIN: #define __UINT32_C(c) c##U // AARCH64-DARWIN: #define __UINT32_C_SUFFIX__ U // AARCH64-DARWIN: #define __UINT32_MAX__ 4294967295U // AARCH64-DARWIN: #define __UINT32_TYPE__ unsigned int +// AARCH64-DARWIN: #define __UINT64_C(c) c##ULL // AARCH64-DARWIN: #define __UINT64_C_SUFFIX__ ULL // AARCH64-DARWIN: #define __UINT64_MAX__ 18446744073709551615ULL // AARCH64-DARWIN: #define __UINT64_TYPE__ long long unsigned int +// AARCH64-DARWIN: #define __UINT8_C(c) c // AARCH64-DARWIN: #define __UINT8_C_SUFFIX__ // AARCH64-DARWIN: #define __UINT8_MAX__ 255 // AARCH64-DARWIN: #define __UINT8_TYPE__ unsigned char +// AARCH64-DARWIN: #define __UINTMAX_C(c) c##UL // AARCH64-DARWIN: #define __UINTMAX_C_SUFFIX__ UL // AARCH64-DARWIN: #define __UINTMAX_MAX__ 18446744073709551615UL // AARCH64-DARWIN: #define __UINTMAX_TYPE__ long unsigned int @@ -703,18 +723,23 @@ // AARCH64-MSVC: #define __STDC_UTF_32__ 1 // AARCH64-MSVC: #define __STDC_VERSION__ 201710L // AARCH64-MSVC: #define __STDC__ 1 +// AARCH64-MSVC: #define __UINT16_C(c) c // AARCH64-MSVC: #define __UINT16_C_SUFFIX__ // AARCH64-MSVC: #define __UINT16_MAX__ 65535 // AARCH64-MSVC: #define __UINT16_TYPE__ unsigned short +// AARCH64-MSVC: #define __UINT32_C(c) c##U // AARCH64-MSVC: #define __UINT32_C_SUFFIX__ U // AARCH64-MSVC: #define __UINT32_MAX__ 4294967295U // AARCH64-MSVC: #define __UINT32_TYPE__ unsigned int +// AARCH64-MSVC: #define __UINT64_C(c) c##ULL // AARCH64-MSVC: #define __UINT64_C_SUFFIX__ ULL // AARCH64-MSVC: #define __UINT64_MAX__ 
18446744073709551615ULL // AARCH64-MSVC: #define __UINT64_TYPE__ long long unsigned int +// AARCH64-MSVC: #define __UINT8_C(c) c // AARCH64-MSVC: #define __UINT8_C_SUFFIX__ // AARCH64-MSVC: #define __UINT8_MAX__ 255 // AARCH64-MSVC: #define __UINT8_TYPE__ unsigned char +// AARCH64-MSVC: #define __UINTMAX_C(c) c##ULL // AARCH64-MSVC: #define __UINTMAX_C_SUFFIX__ ULL // AARCH64-MSVC: #define __UINTMAX_MAX__ 18446744073709551615ULL // AARCH64-MSVC: #define __UINTMAX_TYPE__ long long unsigned int @@ -867,26 +892,31 @@ // ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 // ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 // ARM64EC-MSVC: #define __HAVE_FUNCTION_MULTI_VERSIONING 1 +// ARM64EC-MSVC: #define __INT16_C(c) c // ARM64EC-MSVC: #define __INT16_C_SUFFIX__ // ARM64EC-MSVC: #define __INT16_FMTd__ "hd" // ARM64EC-MSVC: #define __INT16_FMTi__ "hi" // ARM64EC-MSVC: #define __INT16_MAX__ 32767 // ARM64EC-MSVC: #define __INT16_TYPE__ short +// ARM64EC-MSVC: #define __INT32_C(c) c // ARM64EC-MSVC: #define __INT32_C_SUFFIX__ // ARM64EC-MSVC: #define __INT32_FMTd__ "d" // ARM64EC-MSVC: #define __INT32_FMTi__ "i" // ARM64EC-MSVC: #define __INT32_MAX__ 2147483647 // ARM64EC-MSVC: #define __INT32_TYPE__ int +// ARM64EC-MSVC: #define __INT64_C(c) c##LL // ARM64EC-MSVC: #define __INT64_C_SUFFIX__ LL // ARM64EC-MSVC: #define __INT64_FMTd__ "lld" // ARM64EC-MSVC: #define __INT64_FMTi__ "lli" // ARM64EC-MSVC: #define __INT64_MAX__ 9223372036854775807LL // ARM64EC-MSVC: #define __INT64_TYPE__ long long int +// ARM64EC-MSVC: #define __INT8_C(c) c // ARM64EC-MSVC: #define __INT8_C_SUFFIX__ // ARM64EC-MSVC: #define __INT8_FMTd__ "hhd" // ARM64EC-MSVC: #define __INT8_FMTi__ "hhi" // ARM64EC-MSVC: #define __INT8_MAX__ 127 // ARM64EC-MSVC: #define __INT8_TYPE__ signed char +// ARM64EC-MSVC: #define __INTMAX_C(c) c##LL // ARM64EC-MSVC: #define __INTMAX_C_SUFFIX__ LL // ARM64EC-MSVC: #define __INTMAX_FMTd__ "lld" // ARM64EC-MSVC: #define __INTMAX_FMTi__ "lli" @@ -1013,6 +1043,7 @@ // ARM64EC-MSVC: #define __STDC_UTF_32__ 1 // ARM64EC-MSVC: #define __STDC_VERSION__ 201710L // ARM64EC-MSVC: #define __STDC__ 1 +// ARM64EC-MSVC: #define __UINT16_C(c) c // ARM64EC-MSVC: #define __UINT16_C_SUFFIX__ // ARM64EC-MSVC: #define __UINT16_FMTX__ "hX" // ARM64EC-MSVC: #define __UINT16_FMTo__ "ho" @@ -1020,6 +1051,7 @@ // ARM64EC-MSVC: #define __UINT16_FMTx__ "hx" // ARM64EC-MSVC: #define __UINT16_MAX__ 65535 // ARM64EC-MSVC: #define __UINT16_TYPE__ unsigned short +// ARM64EC-MSVC: #define __UINT32_C(c) c##U // ARM64EC-MSVC: #define __UINT32_C_SUFFIX__ U // ARM64EC-MSVC: #define __UINT32_FMTX__ "X" // ARM64EC-MSVC: #define __UINT32_FMTo__ "o" @@ -1027,6 +1059,7 @@ // ARM64EC-MSVC: #define __UINT32_FMTx__ "x" // ARM64EC-MSVC: #define __UINT32_MAX__ 4294967295U // ARM64EC-MSVC: #define __UINT32_TYPE__ unsigned int +// ARM64EC-MSVC: #define __UINT64_C(c) c##ULL // ARM64EC-MSVC: #define __UINT64_C_SUFFIX__ ULL // ARM64EC-MSVC: #define __UINT64_FMTX__ "llX" // ARM64EC-MSVC: #define __UINT64_FMTo__ "llo" @@ -1034,6 +1067,7 @@ // ARM64EC-MSVC: #define __UINT64_FMTx__ "llx" // ARM64EC-MSVC: #define __UINT64_MAX__ 18446744073709551615ULL // ARM64EC-MSVC: #define __UINT64_TYPE__ long long unsigned int +// ARM64EC-MSVC: #define __UINT8_C(c) c // ARM64EC-MSVC: #define __UINT8_C_SUFFIX__ // ARM64EC-MSVC: #define __UINT8_FMTX__ "hhX" // ARM64EC-MSVC: #define __UINT8_FMTo__ "hho" @@ -1041,6 +1075,7 @@ // ARM64EC-MSVC: #define __UINT8_FMTx__ "hhx" // ARM64EC-MSVC: #define __UINT8_MAX__ 255 // ARM64EC-MSVC: #define 
__UINT8_TYPE__ unsigned char +// ARM64EC-MSVC: #define __UINTMAX_C(c) c##ULL // ARM64EC-MSVC: #define __UINTMAX_C_SUFFIX__ ULL // ARM64EC-MSVC: #define __UINTMAX_FMTX__ "llX" // ARM64EC-MSVC: #define __UINTMAX_FMTo__ "llo" diff --git a/clang/test/Preprocessor/init-arm.c b/clang/test/Preprocessor/init-arm.c index 6e3acacc5c3a5..d2fcfe94bcd3d 100644 --- a/clang/test/Preprocessor/init-arm.c +++ b/clang/test/Preprocessor/init-arm.c @@ -46,26 +46,31 @@ // ARM:#define __FLT_MIN_EXP__ (-125) // ARM:#define __FLT_MIN__ 1.17549435e-38F // ARM:#define __FLT_RADIX__ 2 +// ARM:#define __INT16_C(c) c // ARM:#define __INT16_C_SUFFIX__ // ARM:#define __INT16_FMTd__ "hd" // ARM:#define __INT16_FMTi__ "hi" // ARM:#define __INT16_MAX__ 32767 // ARM:#define __INT16_TYPE__ short +// ARM:#define __INT32_C(c) c // ARM:#define __INT32_C_SUFFIX__ // ARM:#define __INT32_FMTd__ "d" // ARM:#define __INT32_FMTi__ "i" // ARM:#define __INT32_MAX__ 2147483647 // ARM:#define __INT32_TYPE__ int +// ARM:#define __INT64_C(c) c##LL // ARM:#define __INT64_C_SUFFIX__ LL // ARM:#define __INT64_FMTd__ "lld" // ARM:#define __INT64_FMTi__ "lli" // ARM:#define __INT64_MAX__ 9223372036854775807LL // ARM:#define __INT64_TYPE__ long long int +// ARM:#define __INT8_C(c) c // ARM:#define __INT8_C_SUFFIX__ // ARM:#define __INT8_FMTd__ "hhd" // ARM:#define __INT8_FMTi__ "hhi" // ARM:#define __INT8_MAX__ 127 // ARM:#define __INT8_TYPE__ signed char +// ARM:#define __INTMAX_C(c) c##LL // ARM:#define __INTMAX_C_SUFFIX__ LL // ARM:#define __INTMAX_FMTd__ "lld" // ARM:#define __INTMAX_FMTi__ "lli" @@ -151,18 +156,23 @@ // ARM:#define __SIZE_TYPE__ unsigned int // ARM:#define __SIZE_WIDTH__ 32 // ARM-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U +// ARM:#define __UINT16_C(c) c // ARM:#define __UINT16_C_SUFFIX__ // ARM:#define __UINT16_MAX__ 65535 // ARM:#define __UINT16_TYPE__ unsigned short +// ARM:#define __UINT32_C(c) c##U // ARM:#define __UINT32_C_SUFFIX__ U // ARM:#define __UINT32_MAX__ 4294967295U // ARM:#define __UINT32_TYPE__ unsigned int +// ARM:#define __UINT64_C(c) c##ULL // ARM:#define __UINT64_C_SUFFIX__ ULL // ARM:#define __UINT64_MAX__ 18446744073709551615ULL // ARM:#define __UINT64_TYPE__ long long unsigned int +// ARM:#define __UINT8_C(c) c // ARM:#define __UINT8_C_SUFFIX__ // ARM:#define __UINT8_MAX__ 255 // ARM:#define __UINT8_TYPE__ unsigned char +// ARM:#define __UINTMAX_C(c) c##ULL // ARM:#define __UINTMAX_C_SUFFIX__ ULL // ARM:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARM:#define __UINTMAX_TYPE__ long long unsigned int @@ -248,26 +258,31 @@ // ARM-BE:#define __FLT_MIN_EXP__ (-125) // ARM-BE:#define __FLT_MIN__ 1.17549435e-38F // ARM-BE:#define __FLT_RADIX__ 2 +// ARM-BE:#define __INT16_C(c) c // ARM-BE:#define __INT16_C_SUFFIX__ // ARM-BE:#define __INT16_FMTd__ "hd" // ARM-BE:#define __INT16_FMTi__ "hi" // ARM-BE:#define __INT16_MAX__ 32767 // ARM-BE:#define __INT16_TYPE__ short +// ARM-BE:#define __INT32_C(c) c // ARM-BE:#define __INT32_C_SUFFIX__ // ARM-BE:#define __INT32_FMTd__ "d" // ARM-BE:#define __INT32_FMTi__ "i" // ARM-BE:#define __INT32_MAX__ 2147483647 // ARM-BE:#define __INT32_TYPE__ int +// ARM-BE:#define __INT64_C(c) c##LL // ARM-BE:#define __INT64_C_SUFFIX__ LL // ARM-BE:#define __INT64_FMTd__ "lld" // ARM-BE:#define __INT64_FMTi__ "lli" // ARM-BE:#define __INT64_MAX__ 9223372036854775807LL // ARM-BE:#define __INT64_TYPE__ long long int +// ARM-BE:#define __INT8_C(c) c // ARM-BE:#define __INT8_C_SUFFIX__ // ARM-BE:#define __INT8_FMTd__ "hhd" // ARM-BE:#define __INT8_FMTi__ "hhi" // ARM-BE:#define 
__INT8_MAX__ 127 // ARM-BE:#define __INT8_TYPE__ signed char +// ARM-BE:#define __INTMAX_C(c) c##LL // ARM-BE:#define __INTMAX_C_SUFFIX__ LL // ARM-BE:#define __INTMAX_FMTd__ "lld" // ARM-BE:#define __INTMAX_FMTi__ "lli" @@ -351,18 +366,23 @@ // ARM-BE:#define __SIZE_MAX__ 4294967295U // ARM-BE:#define __SIZE_TYPE__ unsigned int // ARM-BE:#define __SIZE_WIDTH__ 32 +// ARM-BE:#define __UINT16_C(c) c // ARM-BE:#define __UINT16_C_SUFFIX__ // ARM-BE:#define __UINT16_MAX__ 65535 // ARM-BE:#define __UINT16_TYPE__ unsigned short +// ARM-BE:#define __UINT32_C(c) c##U // ARM-BE:#define __UINT32_C_SUFFIX__ U // ARM-BE:#define __UINT32_MAX__ 4294967295U // ARM-BE:#define __UINT32_TYPE__ unsigned int +// ARM-BE:#define __UINT64_C(c) c##ULL // ARM-BE:#define __UINT64_C_SUFFIX__ ULL // ARM-BE:#define __UINT64_MAX__ 18446744073709551615ULL // ARM-BE:#define __UINT64_TYPE__ long long unsigned int +// ARM-BE:#define __UINT8_C(c) c // ARM-BE:#define __UINT8_C_SUFFIX__ // ARM-BE:#define __UINT8_MAX__ 255 // ARM-BE:#define __UINT8_TYPE__ unsigned char +// ARM-BE:#define __UINTMAX_C(c) c##ULL // ARM-BE:#define __UINTMAX_C_SUFFIX__ ULL // ARM-BE:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARM-BE:#define __UINTMAX_TYPE__ long long unsigned int @@ -440,26 +460,31 @@ // ARMEABISOFT:#define __FLT_MIN_EXP__ (-125) // ARMEABISOFT:#define __FLT_MIN__ 1.17549435e-38F // ARMEABISOFT:#define __FLT_RADIX__ 2 +// ARMEABISOFT:#define __INT16_C(c) c // ARMEABISOFT:#define __INT16_C_SUFFIX__ // ARMEABISOFT:#define __INT16_FMTd__ "hd" // ARMEABISOFT:#define __INT16_FMTi__ "hi" // ARMEABISOFT:#define __INT16_MAX__ 32767 // ARMEABISOFT:#define __INT16_TYPE__ short +// ARMEABISOFT:#define __INT32_C(c) c // ARMEABISOFT:#define __INT32_C_SUFFIX__ // ARMEABISOFT:#define __INT32_FMTd__ "d" // ARMEABISOFT:#define __INT32_FMTi__ "i" // ARMEABISOFT:#define __INT32_MAX__ 2147483647 // ARMEABISOFT:#define __INT32_TYPE__ int +// ARMEABISOFT:#define __INT64_C(c) c##LL // ARMEABISOFT:#define __INT64_C_SUFFIX__ LL // ARMEABISOFT:#define __INT64_FMTd__ "lld" // ARMEABISOFT:#define __INT64_FMTi__ "lli" // ARMEABISOFT:#define __INT64_MAX__ 9223372036854775807LL // ARMEABISOFT:#define __INT64_TYPE__ long long int +// ARMEABISOFT:#define __INT8_C(c) c // ARMEABISOFT:#define __INT8_C_SUFFIX__ // ARMEABISOFT:#define __INT8_FMTd__ "hhd" // ARMEABISOFT:#define __INT8_FMTi__ "hhi" // ARMEABISOFT:#define __INT8_MAX__ 127 // ARMEABISOFT:#define __INT8_TYPE__ signed char +// ARMEABISOFT:#define __INTMAX_C(c) c##LL // ARMEABISOFT:#define __INTMAX_C_SUFFIX__ LL // ARMEABISOFT:#define __INTMAX_FMTd__ "lld" // ARMEABISOFT:#define __INTMAX_FMTi__ "lli" @@ -545,18 +570,23 @@ // ARMEABISOFT:#define __SIZE_TYPE__ unsigned int // ARMEABISOFT:#define __SIZE_WIDTH__ 32 // ARMEABISOFT:#define __SOFTFP__ 1 +// ARMEABISOFT:#define __UINT16_C(c) c // ARMEABISOFT:#define __UINT16_C_SUFFIX__ // ARMEABISOFT:#define __UINT16_MAX__ 65535 // ARMEABISOFT:#define __UINT16_TYPE__ unsigned short +// ARMEABISOFT:#define __UINT32_C(c) c##U // ARMEABISOFT:#define __UINT32_C_SUFFIX__ U // ARMEABISOFT:#define __UINT32_MAX__ 4294967295U // ARMEABISOFT:#define __UINT32_TYPE__ unsigned int +// ARMEABISOFT:#define __UINT64_C(c) c##ULL // ARMEABISOFT:#define __UINT64_C_SUFFIX__ ULL // ARMEABISOFT:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABISOFT:#define __UINT64_TYPE__ long long unsigned int +// ARMEABISOFT:#define __UINT8_C(c) c // ARMEABISOFT:#define __UINT8_C_SUFFIX__ // ARMEABISOFT:#define __UINT8_MAX__ 255 // ARMEABISOFT:#define __UINT8_TYPE__ unsigned char +// 
ARMEABISOFT:#define __UINTMAX_C(c) c##ULL // ARMEABISOFT:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABISOFT:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABISOFT:#define __UINTMAX_TYPE__ long long unsigned int @@ -640,26 +670,31 @@ // ARMEABISOFTFP_NOFP:#define __FLT_MIN_EXP__ (-125) // ARMEABISOFTFP_NOFP:#define __FLT_MIN__ 1.17549435e-38F // ARMEABISOFTFP_NOFP:#define __FLT_RADIX__ 2 +// ARMEABISOFTFP_NOFP:#define __INT16_C(c) c // ARMEABISOFTFP_NOFP:#define __INT16_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __INT16_FMTd__ "hd" // ARMEABISOFTFP_NOFP:#define __INT16_FMTi__ "hi" // ARMEABISOFTFP_NOFP:#define __INT16_MAX__ 32767 // ARMEABISOFTFP_NOFP:#define __INT16_TYPE__ short +// ARMEABISOFTFP_NOFP:#define __INT32_C(c) c // ARMEABISOFTFP_NOFP:#define __INT32_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __INT32_FMTd__ "d" // ARMEABISOFTFP_NOFP:#define __INT32_FMTi__ "i" // ARMEABISOFTFP_NOFP:#define __INT32_MAX__ 2147483647 // ARMEABISOFTFP_NOFP:#define __INT32_TYPE__ int +// ARMEABISOFTFP_NOFP:#define __INT64_C(c) c##LL // ARMEABISOFTFP_NOFP:#define __INT64_C_SUFFIX__ LL // ARMEABISOFTFP_NOFP:#define __INT64_FMTd__ "lld" // ARMEABISOFTFP_NOFP:#define __INT64_FMTi__ "lli" // ARMEABISOFTFP_NOFP:#define __INT64_MAX__ 9223372036854775807LL // ARMEABISOFTFP_NOFP:#define __INT64_TYPE__ long long int +// ARMEABISOFTFP_NOFP:#define __INT8_C(c) c // ARMEABISOFTFP_NOFP:#define __INT8_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __INT8_FMTd__ "hhd" // ARMEABISOFTFP_NOFP:#define __INT8_FMTi__ "hhi" // ARMEABISOFTFP_NOFP:#define __INT8_MAX__ 127 // ARMEABISOFTFP_NOFP:#define __INT8_TYPE__ signed char +// ARMEABISOFTFP_NOFP:#define __INTMAX_C(c) c##LL // ARMEABISOFTFP_NOFP:#define __INTMAX_C_SUFFIX__ LL // ARMEABISOFTFP_NOFP:#define __INTMAX_FMTd__ "lld" // ARMEABISOFTFP_NOFP:#define __INTMAX_FMTi__ "lli" @@ -745,18 +780,23 @@ // ARMEABISOFTFP_NOFP:#define __SIZE_TYPE__ unsigned int // ARMEABISOFTFP_NOFP:#define __SIZE_WIDTH__ 32 // ARMEABISOFTFP_NOFP:#define __SOFTFP__ 1 +// ARMEABISOFTFP_NOFP:#define __UINT16_C(c) c // ARMEABISOFTFP_NOFP:#define __UINT16_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __UINT16_MAX__ 65535 // ARMEABISOFTFP_NOFP:#define __UINT16_TYPE__ unsigned short +// ARMEABISOFTFP_NOFP:#define __UINT32_C(c) c##U // ARMEABISOFTFP_NOFP:#define __UINT32_C_SUFFIX__ U // ARMEABISOFTFP_NOFP:#define __UINT32_MAX__ 4294967295U // ARMEABISOFTFP_NOFP:#define __UINT32_TYPE__ unsigned int +// ARMEABISOFTFP_NOFP:#define __UINT64_C(c) c##ULL // ARMEABISOFTFP_NOFP:#define __UINT64_C_SUFFIX__ ULL // ARMEABISOFTFP_NOFP:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_NOFP:#define __UINT64_TYPE__ long long unsigned int +// ARMEABISOFTFP_NOFP:#define __UINT8_C(c) c // ARMEABISOFTFP_NOFP:#define __UINT8_C_SUFFIX__ // ARMEABISOFTFP_NOFP:#define __UINT8_MAX__ 255 // ARMEABISOFTFP_NOFP:#define __UINT8_TYPE__ unsigned char +// ARMEABISOFTFP_NOFP:#define __UINTMAX_C(c) c##ULL // ARMEABISOFTFP_NOFP:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABISOFTFP_NOFP:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_NOFP:#define __UINTMAX_TYPE__ long long unsigned int @@ -834,26 +874,31 @@ // ARMEABISOFTFP_FP:#define __FLT_MIN_EXP__ (-125) // ARMEABISOFTFP_FP:#define __FLT_MIN__ 1.17549435e-38F // ARMEABISOFTFP_FP:#define __FLT_RADIX__ 2 +// ARMEABISOFTFP_FP:#define __INT16_C(c) c // ARMEABISOFTFP_FP:#define __INT16_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __INT16_FMTd__ "hd" // ARMEABISOFTFP_FP:#define __INT16_FMTi__ "hi" // ARMEABISOFTFP_FP:#define __INT16_MAX__ 32767 // ARMEABISOFTFP_FP:#define 
__INT16_TYPE__ short +// ARMEABISOFTFP_FP:#define __INT32_C(c) c // ARMEABISOFTFP_FP:#define __INT32_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __INT32_FMTd__ "d" // ARMEABISOFTFP_FP:#define __INT32_FMTi__ "i" // ARMEABISOFTFP_FP:#define __INT32_MAX__ 2147483647 // ARMEABISOFTFP_FP:#define __INT32_TYPE__ int +// ARMEABISOFTFP_FP:#define __INT64_C(c) c##LL // ARMEABISOFTFP_FP:#define __INT64_C_SUFFIX__ LL // ARMEABISOFTFP_FP:#define __INT64_FMTd__ "lld" // ARMEABISOFTFP_FP:#define __INT64_FMTi__ "lli" // ARMEABISOFTFP_FP:#define __INT64_MAX__ 9223372036854775807LL // ARMEABISOFTFP_FP:#define __INT64_TYPE__ long long int +// ARMEABISOFTFP_FP:#define __INT8_C(c) c // ARMEABISOFTFP_FP:#define __INT8_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __INT8_FMTd__ "hhd" // ARMEABISOFTFP_FP:#define __INT8_FMTi__ "hhi" // ARMEABISOFTFP_FP:#define __INT8_MAX__ 127 // ARMEABISOFTFP_FP:#define __INT8_TYPE__ signed char +// ARMEABISOFTFP_FP:#define __INTMAX_C(c) c##LL // ARMEABISOFTFP_FP:#define __INTMAX_C_SUFFIX__ LL // ARMEABISOFTFP_FP:#define __INTMAX_FMTd__ "lld" // ARMEABISOFTFP_FP:#define __INTMAX_FMTi__ "lli" @@ -939,18 +984,23 @@ // ARMEABISOFTFP_FP:#define __SIZE_TYPE__ unsigned int // ARMEABISOFTFP_FP:#define __SIZE_WIDTH__ 32 // ARMEABISOFTFP_FP-NOT:#define __SOFTFP__ 1 +// ARMEABISOFTFP_FP:#define __UINT16_C(c) c // ARMEABISOFTFP_FP:#define __UINT16_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __UINT16_MAX__ 65535 // ARMEABISOFTFP_FP:#define __UINT16_TYPE__ unsigned short +// ARMEABISOFTFP_FP:#define __UINT32_C(c) c##U // ARMEABISOFTFP_FP:#define __UINT32_C_SUFFIX__ U // ARMEABISOFTFP_FP:#define __UINT32_MAX__ 4294967295U // ARMEABISOFTFP_FP:#define __UINT32_TYPE__ unsigned int +// ARMEABISOFTFP_FP:#define __UINT64_C(c) c##ULL // ARMEABISOFTFP_FP:#define __UINT64_C_SUFFIX__ ULL // ARMEABISOFTFP_FP:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_FP:#define __UINT64_TYPE__ long long unsigned int +// ARMEABISOFTFP_FP:#define __UINT8_C(c) c // ARMEABISOFTFP_FP:#define __UINT8_C_SUFFIX__ // ARMEABISOFTFP_FP:#define __UINT8_MAX__ 255 // ARMEABISOFTFP_FP:#define __UINT8_TYPE__ unsigned char +// ARMEABISOFTFP_FP:#define __UINTMAX_C(c) c##ULL // ARMEABISOFTFP_FP:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABISOFTFP_FP:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABISOFTFP_FP:#define __UINTMAX_TYPE__ long long unsigned int @@ -1028,26 +1078,31 @@ // ARMEABIHARDFP:#define __FLT_MIN_EXP__ (-125) // ARMEABIHARDFP:#define __FLT_MIN__ 1.17549435e-38F // ARMEABIHARDFP:#define __FLT_RADIX__ 2 +// ARMEABIHARDFP:#define __INT16_C(c) c // ARMEABIHARDFP:#define __INT16_C_SUFFIX__ // ARMEABIHARDFP:#define __INT16_FMTd__ "hd" // ARMEABIHARDFP:#define __INT16_FMTi__ "hi" // ARMEABIHARDFP:#define __INT16_MAX__ 32767 // ARMEABIHARDFP:#define __INT16_TYPE__ short +// ARMEABIHARDFP:#define __INT32_C(c) c // ARMEABIHARDFP:#define __INT32_C_SUFFIX__ // ARMEABIHARDFP:#define __INT32_FMTd__ "d" // ARMEABIHARDFP:#define __INT32_FMTi__ "i" // ARMEABIHARDFP:#define __INT32_MAX__ 2147483647 // ARMEABIHARDFP:#define __INT32_TYPE__ int +// ARMEABIHARDFP:#define __INT64_C(c) c##LL // ARMEABIHARDFP:#define __INT64_C_SUFFIX__ LL // ARMEABIHARDFP:#define __INT64_FMTd__ "lld" // ARMEABIHARDFP:#define __INT64_FMTi__ "lli" // ARMEABIHARDFP:#define __INT64_MAX__ 9223372036854775807LL // ARMEABIHARDFP:#define __INT64_TYPE__ long long int +// ARMEABIHARDFP:#define __INT8_C(c) c // ARMEABIHARDFP:#define __INT8_C_SUFFIX__ // ARMEABIHARDFP:#define __INT8_FMTd__ "hhd" // ARMEABIHARDFP:#define __INT8_FMTi__ "hhi" // ARMEABIHARDFP:#define 
__INT8_MAX__ 127 // ARMEABIHARDFP:#define __INT8_TYPE__ signed char +// ARMEABIHARDFP:#define __INTMAX_C(c) c##LL // ARMEABIHARDFP:#define __INTMAX_C_SUFFIX__ LL // ARMEABIHARDFP:#define __INTMAX_FMTd__ "lld" // ARMEABIHARDFP:#define __INTMAX_FMTi__ "lli" @@ -1133,18 +1188,23 @@ // ARMEABIHARDFP:#define __SIZE_TYPE__ unsigned int // ARMEABIHARDFP:#define __SIZE_WIDTH__ 32 // ARMEABIHARDFP-NOT:#define __SOFTFP__ 1 +// ARMEABIHARDFP:#define __UINT16_C(c) c // ARMEABIHARDFP:#define __UINT16_C_SUFFIX__ // ARMEABIHARDFP:#define __UINT16_MAX__ 65535 // ARMEABIHARDFP:#define __UINT16_TYPE__ unsigned short +// ARMEABIHARDFP:#define __UINT32_C(c) c##U // ARMEABIHARDFP:#define __UINT32_C_SUFFIX__ U // ARMEABIHARDFP:#define __UINT32_MAX__ 4294967295U // ARMEABIHARDFP:#define __UINT32_TYPE__ unsigned int +// ARMEABIHARDFP:#define __UINT64_C(c) c##ULL // ARMEABIHARDFP:#define __UINT64_C_SUFFIX__ ULL // ARMEABIHARDFP:#define __UINT64_MAX__ 18446744073709551615ULL // ARMEABIHARDFP:#define __UINT64_TYPE__ long long unsigned int +// ARMEABIHARDFP:#define __UINT8_C(c) c // ARMEABIHARDFP:#define __UINT8_C_SUFFIX__ // ARMEABIHARDFP:#define __UINT8_MAX__ 255 // ARMEABIHARDFP:#define __UINT8_TYPE__ unsigned char +// ARMEABIHARDFP:#define __UINTMAX_C(c) c##ULL // ARMEABIHARDFP:#define __UINTMAX_C_SUFFIX__ ULL // ARMEABIHARDFP:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARMEABIHARDFP:#define __UINTMAX_TYPE__ long long unsigned int @@ -1220,26 +1280,31 @@ // ARM-NETBSD:#define __FLT_MIN_EXP__ (-125) // ARM-NETBSD:#define __FLT_MIN__ 1.17549435e-38F // ARM-NETBSD:#define __FLT_RADIX__ 2 +// ARM-NETBSD:#define __INT16_C(c) c // ARM-NETBSD:#define __INT16_C_SUFFIX__ // ARM-NETBSD:#define __INT16_FMTd__ "hd" // ARM-NETBSD:#define __INT16_FMTi__ "hi" // ARM-NETBSD:#define __INT16_MAX__ 32767 // ARM-NETBSD:#define __INT16_TYPE__ short +// ARM-NETBSD:#define __INT32_C(c) c // ARM-NETBSD:#define __INT32_C_SUFFIX__ // ARM-NETBSD:#define __INT32_FMTd__ "d" // ARM-NETBSD:#define __INT32_FMTi__ "i" // ARM-NETBSD:#define __INT32_MAX__ 2147483647 // ARM-NETBSD:#define __INT32_TYPE__ int +// ARM-NETBSD:#define __INT64_C(c) c##LL // ARM-NETBSD:#define __INT64_C_SUFFIX__ LL // ARM-NETBSD:#define __INT64_FMTd__ "lld" // ARM-NETBSD:#define __INT64_FMTi__ "lli" // ARM-NETBSD:#define __INT64_MAX__ 9223372036854775807LL // ARM-NETBSD:#define __INT64_TYPE__ long long int +// ARM-NETBSD:#define __INT8_C(c) c // ARM-NETBSD:#define __INT8_C_SUFFIX__ // ARM-NETBSD:#define __INT8_FMTd__ "hhd" // ARM-NETBSD:#define __INT8_FMTi__ "hhi" // ARM-NETBSD:#define __INT8_MAX__ 127 // ARM-NETBSD:#define __INT8_TYPE__ signed char +// ARM-NETBSD:#define __INTMAX_C(c) c##LL // ARM-NETBSD:#define __INTMAX_C_SUFFIX__ LL // ARM-NETBSD:#define __INTMAX_FMTd__ "lld" // ARM-NETBSD:#define __INTMAX_FMTi__ "lli" @@ -1325,18 +1390,23 @@ // ARM-NETBSD:#define __SIZE_TYPE__ long unsigned int // ARM-NETBSD:#define __SIZE_WIDTH__ 32 // ARM-NETBSD:#define __SOFTFP__ 1 +// ARM-NETBSD:#define __UINT16_C(c) c // ARM-NETBSD:#define __UINT16_C_SUFFIX__ // ARM-NETBSD:#define __UINT16_MAX__ 65535 // ARM-NETBSD:#define __UINT16_TYPE__ unsigned short +// ARM-NETBSD:#define __UINT32_C(c) c##U // ARM-NETBSD:#define __UINT32_C_SUFFIX__ U // ARM-NETBSD:#define __UINT32_MAX__ 4294967295U // ARM-NETBSD:#define __UINT32_TYPE__ unsigned int +// ARM-NETBSD:#define __UINT64_C(c) c##ULL // ARM-NETBSD:#define __UINT64_C_SUFFIX__ ULL // ARM-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL // ARM-NETBSD:#define __UINT64_TYPE__ long long unsigned int +// ARM-NETBSD:#define 
__UINT8_C(c) c // ARM-NETBSD:#define __UINT8_C_SUFFIX__ // ARM-NETBSD:#define __UINT8_MAX__ 255 // ARM-NETBSD:#define __UINT8_TYPE__ unsigned char +// ARM-NETBSD:#define __UINTMAX_C(c) c##ULL // ARM-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL // ARM-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615ULL // ARM-NETBSD:#define __UINTMAX_TYPE__ long long unsigned int @@ -1372,6 +1442,7 @@ // RUN: %clang -E -dM -ffreestanding -target arm-netbsd-eabihf %s -o - | FileCheck -match-full-lines -check-prefix ARMHF-NETBSD %s // ARMHF-NETBSD:#define __SIZE_WIDTH__ 32 // ARMHF-NETBSD-NOT:#define __SOFTFP__ 1 +// ARMHF-NETBSD:#define __UINT16_C(c) c // ARMHF-NETBSD:#define __UINT16_C_SUFFIX__ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm-none-eabi < /dev/null | FileCheck -match-full-lines -check-prefix ARM-NONE-EABI %s diff --git a/clang/test/Preprocessor/init-csky.c b/clang/test/Preprocessor/init-csky.c index f7868e02644aa..99c5ad1010edb 100644 --- a/clang/test/Preprocessor/init-csky.c +++ b/clang/test/Preprocessor/init-csky.c @@ -66,18 +66,23 @@ // CSKY: #define __GNUC__ {{.*}} // CSKY: #define __GXX_ABI_VERSION {{.*}} // CSKY: #define __ILP32__ 1 +// CSKY: #define __INT16_C(c) c // CSKY: #define __INT16_C_SUFFIX__ // CSKY: #define __INT16_MAX__ 32767 // CSKY: #define __INT16_TYPE__ short +// CSKY: #define __INT32_C(c) c // CSKY: #define __INT32_C_SUFFIX__ // CSKY: #define __INT32_MAX__ 2147483647 // CSKY: #define __INT32_TYPE__ int +// CSKY: #define __INT64_C(c) c##LL // CSKY: #define __INT64_C_SUFFIX__ LL // CSKY: #define __INT64_MAX__ 9223372036854775807LL // CSKY: #define __INT64_TYPE__ long long int +// CSKY: #define __INT8_C(c) c // CSKY: #define __INT8_C_SUFFIX__ // CSKY: #define __INT8_MAX__ 127 // CSKY: #define __INT8_TYPE__ signed char +// CSKY: #define __INTMAX_C(c) c##LL // CSKY: #define __INTMAX_C_SUFFIX__ LL // CSKY: #define __INTMAX_MAX__ 9223372036854775807LL // CSKY: #define __INTMAX_TYPE__ long long int @@ -152,18 +157,23 @@ // CSKY: #define __STDC_UTF_32__ 1 // CSKY: #define __STDC_VERSION__ 201710L // CSKY: #define __STDC__ 1 +// CSKY: #define __UINT16_C(c) c // CSKY: #define __UINT16_C_SUFFIX__ // CSKY: #define __UINT16_MAX__ 65535 // CSKY: #define __UINT16_TYPE__ unsigned short +// CSKY: #define __UINT32_C(c) c##U // CSKY: #define __UINT32_C_SUFFIX__ U // CSKY: #define __UINT32_MAX__ 4294967295U // CSKY: #define __UINT32_TYPE__ unsigned int +// CSKY: #define __UINT64_C(c) c##ULL // CSKY: #define __UINT64_C_SUFFIX__ ULL // CSKY: #define __UINT64_MAX__ 18446744073709551615ULL // CSKY: #define __UINT64_TYPE__ long long unsigned int +// CSKY: #define __UINT8_C(c) c // CSKY: #define __UINT8_C_SUFFIX__ // CSKY: #define __UINT8_MAX__ 255 // CSKY: #define __UINT8_TYPE__ unsigned char +// CSKY: #define __UINTMAX_C(c) c##ULL // CSKY: #define __UINTMAX_C_SUFFIX__ ULL // CSKY: #define __UINTMAX_MAX__ 18446744073709551615ULL // CSKY: #define __UINTMAX_TYPE__ long long unsigned int diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index f6fd603dc39c0..ac461b371162f 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -85,26 +85,31 @@ // LA32: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 // LA32: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 // LA32: #define __ILP32__ 1 +// LA32: #define __INT16_C(c) c // LA32: #define __INT16_C_SUFFIX__ // LA32: #define __INT16_FMTd__ "hd" // LA32: #define __INT16_FMTi__ "hi" // LA32: #define __INT16_MAX__ 32767 // LA32: #define __INT16_TYPE__ short +// LA32: 
#define __INT32_C(c) c // LA32: #define __INT32_C_SUFFIX__ // LA32: #define __INT32_FMTd__ "d" // LA32: #define __INT32_FMTi__ "i" // LA32: #define __INT32_MAX__ 2147483647 // LA32: #define __INT32_TYPE__ int +// LA32: #define __INT64_C(c) c##LL // LA32: #define __INT64_C_SUFFIX__ LL // LA32: #define __INT64_FMTd__ "lld" // LA32: #define __INT64_FMTi__ "lli" // LA32: #define __INT64_MAX__ 9223372036854775807LL // LA32: #define __INT64_TYPE__ long long int +// LA32: #define __INT8_C(c) c // LA32: #define __INT8_C_SUFFIX__ // LA32: #define __INT8_FMTd__ "hhd" // LA32: #define __INT8_FMTi__ "hhi" // LA32: #define __INT8_MAX__ 127 // LA32: #define __INT8_TYPE__ signed char +// LA32: #define __INTMAX_C(c) c##LL // LA32: #define __INTMAX_C_SUFFIX__ LL // LA32: #define __INTMAX_FMTd__ "lld" // LA32: #define __INTMAX_FMTi__ "lli" @@ -227,6 +232,7 @@ // LA32: #define __STDC_UTF_32__ 1 // LA32: #define __STDC_VERSION__ 201710L // LA32: #define __STDC__ 1 +// LA32: #define __UINT16_C(c) c // LA32: #define __UINT16_C_SUFFIX__ // LA32: #define __UINT16_FMTX__ "hX" // LA32: #define __UINT16_FMTo__ "ho" @@ -234,6 +240,7 @@ // LA32: #define __UINT16_FMTx__ "hx" // LA32: #define __UINT16_MAX__ 65535 // LA32: #define __UINT16_TYPE__ unsigned short +// LA32: #define __UINT32_C(c) c##U // LA32: #define __UINT32_C_SUFFIX__ U // LA32: #define __UINT32_FMTX__ "X" // LA32: #define __UINT32_FMTo__ "o" @@ -241,6 +248,7 @@ // LA32: #define __UINT32_FMTx__ "x" // LA32: #define __UINT32_MAX__ 4294967295U // LA32: #define __UINT32_TYPE__ unsigned int +// LA32: #define __UINT64_C(c) c##ULL // LA32: #define __UINT64_C_SUFFIX__ ULL // LA32: #define __UINT64_FMTX__ "llX" // LA32: #define __UINT64_FMTo__ "llo" @@ -248,6 +256,7 @@ // LA32: #define __UINT64_FMTx__ "llx" // LA32: #define __UINT64_MAX__ 18446744073709551615ULL // LA32: #define __UINT64_TYPE__ long long unsigned int +// LA32: #define __UINT8_C(c) c // LA32: #define __UINT8_C_SUFFIX__ // LA32: #define __UINT8_FMTX__ "hhX" // LA32: #define __UINT8_FMTo__ "hho" @@ -255,6 +264,7 @@ // LA32: #define __UINT8_FMTx__ "hhx" // LA32: #define __UINT8_MAX__ 255 // LA32: #define __UINT8_TYPE__ unsigned char +// LA32: #define __UINTMAX_C(c) c##ULL // LA32: #define __UINTMAX_C_SUFFIX__ ULL // LA32: #define __UINTMAX_FMTX__ "llX" // LA32: #define __UINTMAX_FMTo__ "llo" @@ -406,26 +416,31 @@ // LA64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 // LA64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 // LA64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 +// LA64: #define __INT16_C(c) c // LA64: #define __INT16_C_SUFFIX__ // LA64: #define __INT16_FMTd__ "hd" // LA64: #define __INT16_FMTi__ "hi" // LA64: #define __INT16_MAX__ 32767 // LA64: #define __INT16_TYPE__ short +// LA64: #define __INT32_C(c) c // LA64: #define __INT32_C_SUFFIX__ // LA64: #define __INT32_FMTd__ "d" // LA64: #define __INT32_FMTi__ "i" // LA64: #define __INT32_MAX__ 2147483647 // LA64: #define __INT32_TYPE__ int +// LA64: #define __INT64_C(c) c##L // LA64: #define __INT64_C_SUFFIX__ L // LA64: #define __INT64_FMTd__ "ld" // LA64: #define __INT64_FMTi__ "li" // LA64: #define __INT64_MAX__ 9223372036854775807L // LA64: #define __INT64_TYPE__ long int +// LA64: #define __INT8_C(c) c // LA64: #define __INT8_C_SUFFIX__ // LA64: #define __INT8_FMTd__ "hhd" // LA64: #define __INT8_FMTi__ "hhi" // LA64: #define __INT8_MAX__ 127 // LA64: #define __INT8_TYPE__ signed char +// LA64: #define __INTMAX_C(c) c##L // LA64: #define __INTMAX_C_SUFFIX__ L // LA64: #define __INTMAX_FMTd__ "ld" // LA64: #define __INTMAX_FMTi__ "li" 
@@ -549,6 +564,7 @@
 // LA64: #define __STDC_UTF_32__ 1
 // LA64: #define __STDC_VERSION__ 201710L
 // LA64: #define __STDC__ 1
+// LA64: #define __UINT16_C(c) c
 // LA64: #define __UINT16_C_SUFFIX__
 // LA64: #define __UINT16_FMTX__ "hX"
 // LA64: #define __UINT16_FMTo__ "ho"
@@ -556,6 +572,7 @@
 // LA64: #define __UINT16_FMTx__ "hx"
 // LA64: #define __UINT16_MAX__ 65535
 // LA64: #define __UINT16_TYPE__ unsigned short
+// LA64: #define __UINT32_C(c) c##U
 // LA64: #define __UINT32_C_SUFFIX__ U
 // LA64: #define __UINT32_FMTX__ "X"
 // LA64: #define __UINT32_FMTo__ "o"
@@ -563,6 +580,7 @@
 // LA64: #define __UINT32_FMTx__ "x"
 // LA64: #define __UINT32_MAX__ 4294967295U
 // LA64: #define __UINT32_TYPE__ unsigned int
+// LA64: #define __UINT64_C(c) c##UL
 // LA64: #define __UINT64_C_SUFFIX__ UL
 // LA64: #define __UINT64_FMTX__ "lX"
 // LA64: #define __UINT64_FMTo__ "lo"
@@ -570,6 +588,7 @@
 // LA64: #define __UINT64_FMTx__ "lx"
 // LA64: #define __UINT64_MAX__ 18446744073709551615UL
 // LA64: #define __UINT64_TYPE__ long unsigned int
+// LA64: #define __UINT8_C(c) c
 // LA64: #define __UINT8_C_SUFFIX__
 // LA64: #define __UINT8_FMTX__ "hhX"
 // LA64: #define __UINT8_FMTo__ "hho"
@@ -577,6 +596,7 @@
 // LA64: #define __UINT8_FMTx__ "hhx"
 // LA64: #define __UINT8_MAX__ 255
 // LA64: #define __UINT8_TYPE__ unsigned char
+// LA64: #define __UINTMAX_C(c) c##UL
 // LA64: #define __UINTMAX_C_SUFFIX__ UL
 // LA64: #define __UINTMAX_FMTX__ "lX"
 // LA64: #define __UINTMAX_FMTo__ "lo"
diff --git a/clang/test/Preprocessor/init-mips.c b/clang/test/Preprocessor/init-mips.c
index 34091ea3690da..4fead33bd826e 100644
--- a/clang/test/Preprocessor/init-mips.c
+++ b/clang/test/Preprocessor/init-mips.c
@@ -49,26 +49,31 @@
 // MIPS32BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS32BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32BE:#define __FLT_RADIX__ 2
+// MIPS32BE:#define __INT16_C(c) c
 // MIPS32BE:#define __INT16_C_SUFFIX__
 // MIPS32BE:#define __INT16_FMTd__ "hd"
 // MIPS32BE:#define __INT16_FMTi__ "hi"
 // MIPS32BE:#define __INT16_MAX__ 32767
 // MIPS32BE:#define __INT16_TYPE__ short
+// MIPS32BE:#define __INT32_C(c) c
 // MIPS32BE:#define __INT32_C_SUFFIX__
 // MIPS32BE:#define __INT32_FMTd__ "d"
 // MIPS32BE:#define __INT32_FMTi__ "i"
 // MIPS32BE:#define __INT32_MAX__ 2147483647
 // MIPS32BE:#define __INT32_TYPE__ int
+// MIPS32BE:#define __INT64_C(c) c##LL
 // MIPS32BE:#define __INT64_C_SUFFIX__ LL
 // MIPS32BE:#define __INT64_FMTd__ "lld"
 // MIPS32BE:#define __INT64_FMTi__ "lli"
 // MIPS32BE:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32BE:#define __INT64_TYPE__ long long int
+// MIPS32BE:#define __INT8_C(c) c
 // MIPS32BE:#define __INT8_C_SUFFIX__
 // MIPS32BE:#define __INT8_FMTd__ "hhd"
 // MIPS32BE:#define __INT8_FMTi__ "hhi"
 // MIPS32BE:#define __INT8_MAX__ 127
 // MIPS32BE:#define __INT8_TYPE__ signed char
+// MIPS32BE:#define __INTMAX_C(c) c##LL
 // MIPS32BE:#define __INTMAX_C_SUFFIX__ LL
 // MIPS32BE:#define __INTMAX_FMTd__ "lld"
 // MIPS32BE:#define __INTMAX_FMTi__ "lli"
@@ -159,18 +164,23 @@
 // MIPS32BE:#define __STDC_HOSTED__ 0
 // MIPS32BE-C:#define __STDC_VERSION__ 201710L
 // MIPS32BE:#define __STDC__ 1
+// MIPS32BE:#define __UINT16_C(c) c
 // MIPS32BE:#define __UINT16_C_SUFFIX__
 // MIPS32BE:#define __UINT16_MAX__ 65535
 // MIPS32BE:#define __UINT16_TYPE__ unsigned short
+// MIPS32BE:#define __UINT32_C(c) c##U
 // MIPS32BE:#define __UINT32_C_SUFFIX__ U
 // MIPS32BE:#define __UINT32_MAX__ 4294967295U
 // MIPS32BE:#define __UINT32_TYPE__ unsigned int
+// MIPS32BE:#define __UINT64_C(c) c##ULL
 // MIPS32BE:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINT64_TYPE__ long long unsigned int
+// MIPS32BE:#define __UINT8_C(c) c
 // MIPS32BE:#define __UINT8_C_SUFFIX__
 // MIPS32BE:#define __UINT8_MAX__ 255
 // MIPS32BE:#define __UINT8_TYPE__ unsigned char
+// MIPS32BE:#define __UINTMAX_C(c) c##ULL
 // MIPS32BE:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32BE:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32BE:#define __UINTMAX_TYPE__ long long unsigned int
@@ -259,26 +269,31 @@
 // MIPS32EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS32EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS32EL:#define __FLT_RADIX__ 2
+// MIPS32EL:#define __INT16_C(c) c
 // MIPS32EL:#define __INT16_C_SUFFIX__
 // MIPS32EL:#define __INT16_FMTd__ "hd"
 // MIPS32EL:#define __INT16_FMTi__ "hi"
 // MIPS32EL:#define __INT16_MAX__ 32767
 // MIPS32EL:#define __INT16_TYPE__ short
+// MIPS32EL:#define __INT32_C(c) c
 // MIPS32EL:#define __INT32_C_SUFFIX__
 // MIPS32EL:#define __INT32_FMTd__ "d"
 // MIPS32EL:#define __INT32_FMTi__ "i"
 // MIPS32EL:#define __INT32_MAX__ 2147483647
 // MIPS32EL:#define __INT32_TYPE__ int
+// MIPS32EL:#define __INT64_C(c) c##LL
 // MIPS32EL:#define __INT64_C_SUFFIX__ LL
 // MIPS32EL:#define __INT64_FMTd__ "lld"
 // MIPS32EL:#define __INT64_FMTi__ "lli"
 // MIPS32EL:#define __INT64_MAX__ 9223372036854775807LL
 // MIPS32EL:#define __INT64_TYPE__ long long int
+// MIPS32EL:#define __INT8_C(c) c
 // MIPS32EL:#define __INT8_C_SUFFIX__
 // MIPS32EL:#define __INT8_FMTd__ "hhd"
 // MIPS32EL:#define __INT8_FMTi__ "hhi"
 // MIPS32EL:#define __INT8_MAX__ 127
 // MIPS32EL:#define __INT8_TYPE__ signed char
+// MIPS32EL:#define __INTMAX_C(c) c##LL
 // MIPS32EL:#define __INTMAX_C_SUFFIX__ LL
 // MIPS32EL:#define __INTMAX_FMTd__ "lld"
 // MIPS32EL:#define __INTMAX_FMTi__ "lli"
@@ -366,18 +381,23 @@
 // MIPS32EL:#define __SIZE_MAX__ 4294967295U
 // MIPS32EL:#define __SIZE_TYPE__ unsigned int
 // MIPS32EL:#define __SIZE_WIDTH__ 32
+// MIPS32EL:#define __UINT16_C(c) c
 // MIPS32EL:#define __UINT16_C_SUFFIX__
 // MIPS32EL:#define __UINT16_MAX__ 65535
 // MIPS32EL:#define __UINT16_TYPE__ unsigned short
+// MIPS32EL:#define __UINT32_C(c) c##U
 // MIPS32EL:#define __UINT32_C_SUFFIX__ U
 // MIPS32EL:#define __UINT32_MAX__ 4294967295U
 // MIPS32EL:#define __UINT32_TYPE__ unsigned int
+// MIPS32EL:#define __UINT64_C(c) c##ULL
 // MIPS32EL:#define __UINT64_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINT64_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINT64_TYPE__ long long unsigned int
+// MIPS32EL:#define __UINT8_C(c) c
 // MIPS32EL:#define __UINT8_C_SUFFIX__
 // MIPS32EL:#define __UINT8_MAX__ 255
 // MIPS32EL:#define __UINT8_TYPE__ unsigned char
+// MIPS32EL:#define __UINTMAX_C(c) c##ULL
 // MIPS32EL:#define __UINTMAX_C_SUFFIX__ ULL
 // MIPS32EL:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MIPS32EL:#define __UINTMAX_TYPE__ long long unsigned int
@@ -496,26 +516,31 @@
 // MIPSN32BE: #define __GNUC__ 4
 // MIPSN32BE: #define __GXX_ABI_VERSION 1002
 // MIPSN32BE: #define __ILP32__ 1
+// MIPSN32BE: #define __INT16_C(c) c
 // MIPSN32BE: #define __INT16_C_SUFFIX__
 // MIPSN32BE: #define __INT16_FMTd__ "hd"
 // MIPSN32BE: #define __INT16_FMTi__ "hi"
 // MIPSN32BE: #define __INT16_MAX__ 32767
 // MIPSN32BE: #define __INT16_TYPE__ short
+// MIPSN32BE: #define __INT32_C(c) c
 // MIPSN32BE: #define __INT32_C_SUFFIX__
 // MIPSN32BE: #define __INT32_FMTd__ "d"
 // MIPSN32BE: #define __INT32_FMTi__ "i"
 // MIPSN32BE: #define __INT32_MAX__ 2147483647
 // MIPSN32BE: #define __INT32_TYPE__ int
+// MIPSN32BE: #define __INT64_C(c) c##LL
 // MIPSN32BE: #define __INT64_C_SUFFIX__ LL
 // MIPSN32BE: #define __INT64_FMTd__ "lld"
 // MIPSN32BE: #define __INT64_FMTi__ "lli"
 // MIPSN32BE: #define __INT64_MAX__ 9223372036854775807LL
 // MIPSN32BE: #define __INT64_TYPE__ long long int
+// MIPSN32BE: #define __INT8_C(c) c
 // MIPSN32BE: #define __INT8_C_SUFFIX__
 // MIPSN32BE: #define __INT8_FMTd__ "hhd"
 // MIPSN32BE: #define __INT8_FMTi__ "hhi"
 // MIPSN32BE: #define __INT8_MAX__ 127
 // MIPSN32BE: #define __INT8_TYPE__ signed char
+// MIPSN32BE: #define __INTMAX_C(c) c##LL
 // MIPSN32BE: #define __INTMAX_C_SUFFIX__ LL
 // MIPSN32BE: #define __INTMAX_FMTd__ "lld"
 // MIPSN32BE: #define __INTMAX_FMTi__ "lli"
@@ -618,6 +643,7 @@
 // MIPSN32BE: #define __STDC_UTF_32__ 1
 // MIPSN32BE-C: #define __STDC_VERSION__ 201710L
 // MIPSN32BE: #define __STDC__ 1
+// MIPSN32BE: #define __UINT16_C(c) c
 // MIPSN32BE: #define __UINT16_C_SUFFIX__
 // MIPSN32BE: #define __UINT16_FMTX__ "hX"
 // MIPSN32BE: #define __UINT16_FMTo__ "ho"
@@ -625,6 +651,7 @@
 // MIPSN32BE: #define __UINT16_FMTx__ "hx"
 // MIPSN32BE: #define __UINT16_MAX__ 65535
 // MIPSN32BE: #define __UINT16_TYPE__ unsigned short
+// MIPSN32BE: #define __UINT32_C(c) c##U
 // MIPSN32BE: #define __UINT32_C_SUFFIX__ U
 // MIPSN32BE: #define __UINT32_FMTX__ "X"
 // MIPSN32BE: #define __UINT32_FMTo__ "o"
@@ -632,6 +659,7 @@
 // MIPSN32BE: #define __UINT32_FMTx__ "x"
 // MIPSN32BE: #define __UINT32_MAX__ 4294967295U
 // MIPSN32BE: #define __UINT32_TYPE__ unsigned int
+// MIPSN32BE: #define __UINT64_C(c) c##ULL
 // MIPSN32BE: #define __UINT64_C_SUFFIX__ ULL
 // MIPSN32BE: #define __UINT64_FMTX__ "llX"
 // MIPSN32BE: #define __UINT64_FMTo__ "llo"
@@ -639,6 +667,7 @@
 // MIPSN32BE: #define __UINT64_FMTx__ "llx"
 // MIPSN32BE: #define __UINT64_MAX__ 18446744073709551615ULL
 // MIPSN32BE: #define __UINT64_TYPE__ long long unsigned int
+// MIPSN32BE: #define __UINT8_C(c) c
 // MIPSN32BE: #define __UINT8_C_SUFFIX__
 // MIPSN32BE: #define __UINT8_FMTX__ "hhX"
 // MIPSN32BE: #define __UINT8_FMTo__ "hho"
@@ -646,6 +675,7 @@
 // MIPSN32BE: #define __UINT8_FMTx__ "hhx"
 // MIPSN32BE: #define __UINT8_MAX__ 255
 // MIPSN32BE: #define __UINT8_TYPE__ unsigned char
+// MIPSN32BE: #define __UINTMAX_C(c) c##ULL
 // MIPSN32BE: #define __UINTMAX_C_SUFFIX__ ULL
 // MIPSN32BE: #define __UINTMAX_FMTX__ "llX"
 // MIPSN32BE: #define __UINTMAX_FMTo__ "llo"
@@ -803,26 +833,31 @@
 // MIPSN32EL: #define __GNUC__ 4
 // MIPSN32EL: #define __GXX_ABI_VERSION 1002
 // MIPSN32EL: #define __ILP32__ 1
+// MIPSN32EL: #define __INT16_C(c) c
 // MIPSN32EL: #define __INT16_C_SUFFIX__
 // MIPSN32EL: #define __INT16_FMTd__ "hd"
 // MIPSN32EL: #define __INT16_FMTi__ "hi"
 // MIPSN32EL: #define __INT16_MAX__ 32767
 // MIPSN32EL: #define __INT16_TYPE__ short
+// MIPSN32EL: #define __INT32_C(c) c
 // MIPSN32EL: #define __INT32_C_SUFFIX__
 // MIPSN32EL: #define __INT32_FMTd__ "d"
 // MIPSN32EL: #define __INT32_FMTi__ "i"
 // MIPSN32EL: #define __INT32_MAX__ 2147483647
 // MIPSN32EL: #define __INT32_TYPE__ int
+// MIPSN32EL: #define __INT64_C(c) c##LL
 // MIPSN32EL: #define __INT64_C_SUFFIX__ LL
 // MIPSN32EL: #define __INT64_FMTd__ "lld"
 // MIPSN32EL: #define __INT64_FMTi__ "lli"
 // MIPSN32EL: #define __INT64_MAX__ 9223372036854775807LL
 // MIPSN32EL: #define __INT64_TYPE__ long long int
+// MIPSN32EL: #define __INT8_C(c) c
 // MIPSN32EL: #define __INT8_C_SUFFIX__
 // MIPSN32EL: #define __INT8_FMTd__ "hhd"
 // MIPSN32EL: #define __INT8_FMTi__ "hhi"
 // MIPSN32EL: #define __INT8_MAX__ 127
 // MIPSN32EL: #define __INT8_TYPE__ signed char
+// MIPSN32EL: #define __INTMAX_C(c) c##LL
 // MIPSN32EL: #define __INTMAX_C_SUFFIX__ LL
 // MIPSN32EL: #define __INTMAX_FMTd__ "lld"
 // MIPSN32EL: #define __INTMAX_FMTi__ "lli"
@@ -925,6 +960,7 @@
 // MIPSN32EL: #define __STDC_UTF_32__ 1
 // MIPSN32EL: #define __STDC_VERSION__ 201710L
 // MIPSN32EL: #define __STDC__ 1
+// MIPSN32EL: #define __UINT16_C(c) c
 // MIPSN32EL: #define __UINT16_C_SUFFIX__
 // MIPSN32EL: #define __UINT16_FMTX__ "hX"
 // MIPSN32EL: #define __UINT16_FMTo__ "ho"
@@ -932,6 +968,7 @@
 // MIPSN32EL: #define __UINT16_FMTx__ "hx"
 // MIPSN32EL: #define __UINT16_MAX__ 65535
 // MIPSN32EL: #define __UINT16_TYPE__ unsigned short
+// MIPSN32EL: #define __UINT32_C(c) c##U
 // MIPSN32EL: #define __UINT32_C_SUFFIX__ U
 // MIPSN32EL: #define __UINT32_FMTX__ "X"
 // MIPSN32EL: #define __UINT32_FMTo__ "o"
@@ -939,6 +976,7 @@
 // MIPSN32EL: #define __UINT32_FMTx__ "x"
 // MIPSN32EL: #define __UINT32_MAX__ 4294967295U
 // MIPSN32EL: #define __UINT32_TYPE__ unsigned int
+// MIPSN32EL: #define __UINT64_C(c) c##ULL
 // MIPSN32EL: #define __UINT64_C_SUFFIX__ ULL
 // MIPSN32EL: #define __UINT64_FMTX__ "llX"
 // MIPSN32EL: #define __UINT64_FMTo__ "llo"
@@ -946,6 +984,7 @@
 // MIPSN32EL: #define __UINT64_FMTx__ "llx"
 // MIPSN32EL: #define __UINT64_MAX__ 18446744073709551615ULL
 // MIPSN32EL: #define __UINT64_TYPE__ long long unsigned int
+// MIPSN32EL: #define __UINT8_C(c) c
 // MIPSN32EL: #define __UINT8_C_SUFFIX__
 // MIPSN32EL: #define __UINT8_FMTX__ "hhX"
 // MIPSN32EL: #define __UINT8_FMTo__ "hho"
@@ -953,6 +992,7 @@
 // MIPSN32EL: #define __UINT8_FMTx__ "hhx"
 // MIPSN32EL: #define __UINT8_MAX__ 255
 // MIPSN32EL: #define __UINT8_TYPE__ unsigned char
+// MIPSN32EL: #define __UINTMAX_C(c) c##ULL
 // MIPSN32EL: #define __UINTMAX_C_SUFFIX__ ULL
 // MIPSN32EL: #define __UINTMAX_FMTX__ "llX"
 // MIPSN32EL: #define __UINTMAX_FMTo__ "llo"
@@ -1086,26 +1126,31 @@
 // MIPS64BE:#define __FLT_MIN_EXP__ (-125)
 // MIPS64BE:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64BE:#define __FLT_RADIX__ 2
+// MIPS64BE:#define __INT16_C(c) c
 // MIPS64BE:#define __INT16_C_SUFFIX__
 // MIPS64BE:#define __INT16_FMTd__ "hd"
 // MIPS64BE:#define __INT16_FMTi__ "hi"
 // MIPS64BE:#define __INT16_MAX__ 32767
 // MIPS64BE:#define __INT16_TYPE__ short
+// MIPS64BE:#define __INT32_C(c) c
 // MIPS64BE:#define __INT32_C_SUFFIX__
 // MIPS64BE:#define __INT32_FMTd__ "d"
 // MIPS64BE:#define __INT32_FMTi__ "i"
 // MIPS64BE:#define __INT32_MAX__ 2147483647
 // MIPS64BE:#define __INT32_TYPE__ int
+// MIPS64BE:#define __INT64_C(c) c##L
 // MIPS64BE:#define __INT64_C_SUFFIX__ L
 // MIPS64BE:#define __INT64_FMTd__ "ld"
 // MIPS64BE:#define __INT64_FMTi__ "li"
 // MIPS64BE:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64BE:#define __INT64_TYPE__ long int
+// MIPS64BE:#define __INT8_C(c) c
 // MIPS64BE:#define __INT8_C_SUFFIX__
 // MIPS64BE:#define __INT8_FMTd__ "hhd"
 // MIPS64BE:#define __INT8_FMTi__ "hhi"
 // MIPS64BE:#define __INT8_MAX__ 127
 // MIPS64BE:#define __INT8_TYPE__ signed char
+// MIPS64BE:#define __INTMAX_C(c) c##L
 // MIPS64BE:#define __INTMAX_C_SUFFIX__ L
 // MIPS64BE:#define __INTMAX_FMTd__ "ld"
 // MIPS64BE:#define __INTMAX_FMTi__ "li"
@@ -1194,18 +1239,23 @@
 // MIPS64BE:#define __SIZE_TYPE__ long unsigned int
 // MIPS64BE:#define __SIZE_WIDTH__ 64
 // MIPS64BE-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// MIPS64BE:#define __UINT16_C(c) c
 // MIPS64BE:#define __UINT16_C_SUFFIX__
 // MIPS64BE:#define __UINT16_MAX__ 65535
 // MIPS64BE:#define __UINT16_TYPE__ unsigned short
+// MIPS64BE:#define __UINT32_C(c) c##U
 // MIPS64BE:#define __UINT32_C_SUFFIX__ U
 // MIPS64BE:#define __UINT32_MAX__ 4294967295U
 // MIPS64BE:#define __UINT32_TYPE__ unsigned int
+// MIPS64BE:#define __UINT64_C(c) c##UL
 // MIPS64BE:#define __UINT64_C_SUFFIX__ UL
 // MIPS64BE:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __UINT64_TYPE__ long unsigned int
+// MIPS64BE:#define __UINT8_C(c) c
 // MIPS64BE:#define __UINT8_C_SUFFIX__
 // MIPS64BE:#define __UINT8_MAX__ 255
 // MIPS64BE:#define __UINT8_TYPE__ unsigned char
+// MIPS64BE:#define __UINTMAX_C(c) c##UL
 // MIPS64BE:#define __UINTMAX_C_SUFFIX__ UL
 // MIPS64BE:#define __UINTMAX_MAX__ 18446744073709551615UL
 // MIPS64BE:#define __UINTMAX_TYPE__ long unsigned int
@@ -1296,26 +1346,31 @@
 // MIPS64EL:#define __FLT_MIN_EXP__ (-125)
 // MIPS64EL:#define __FLT_MIN__ 1.17549435e-38F
 // MIPS64EL:#define __FLT_RADIX__ 2
+// MIPS64EL:#define __INT16_C(c) c
 // MIPS64EL:#define __INT16_C_SUFFIX__
 // MIPS64EL:#define __INT16_FMTd__ "hd"
 // MIPS64EL:#define __INT16_FMTi__ "hi"
 // MIPS64EL:#define __INT16_MAX__ 32767
 // MIPS64EL:#define __INT16_TYPE__ short
+// MIPS64EL:#define __INT32_C(c) c
 // MIPS64EL:#define __INT32_C_SUFFIX__
 // MIPS64EL:#define __INT32_FMTd__ "d"
 // MIPS64EL:#define __INT32_FMTi__ "i"
 // MIPS64EL:#define __INT32_MAX__ 2147483647
 // MIPS64EL:#define __INT32_TYPE__ int
+// MIPS64EL:#define __INT64_C(c) c##L
 // MIPS64EL:#define __INT64_C_SUFFIX__ L
 // MIPS64EL:#define __INT64_FMTd__ "ld"
 // MIPS64EL:#define __INT64_FMTi__ "li"
 // MIPS64EL:#define __INT64_MAX__ 9223372036854775807L
 // MIPS64EL:#define __INT64_TYPE__ long int
+// MIPS64EL:#define __INT8_C(c) c
 // MIPS64EL:#define __INT8_C_SUFFIX__
 // MIPS64EL:#define __INT8_FMTd__ "hhd"
 // MIPS64EL:#define __INT8_FMTi__ "hhi"
 // MIPS64EL:#define __INT8_MAX__ 127
 // MIPS64EL:#define __INT8_TYPE__ signed char
+// MIPS64EL:#define __INTMAX_C(c) c##L
 // MIPS64EL:#define __INTMAX_C_SUFFIX__ L
 // MIPS64EL:#define __INTMAX_FMTd__ "ld"
 // MIPS64EL:#define __INTMAX_FMTi__ "li"
@@ -1404,18 +1459,23 @@
 // MIPS64EL:#define __SIZE_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __SIZE_TYPE__ long unsigned int
 // MIPS64EL:#define __SIZE_WIDTH__ 64
+// MIPS64EL:#define __UINT16_C(c) c
 // MIPS64EL:#define __UINT16_C_SUFFIX__
 // MIPS64EL:#define __UINT16_MAX__ 65535
 // MIPS64EL:#define __UINT16_TYPE__ unsigned short
+// MIPS64EL:#define __UINT32_C(c) c##U
 // MIPS64EL:#define __UINT32_C_SUFFIX__ U
 // MIPS64EL:#define __UINT32_MAX__ 4294967295U
 // MIPS64EL:#define __UINT32_TYPE__ unsigned int
+// MIPS64EL:#define __UINT64_C(c) c##UL
 // MIPS64EL:#define __UINT64_C_SUFFIX__ UL
 // MIPS64EL:#define __UINT64_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __UINT64_TYPE__ long unsigned int
+// MIPS64EL:#define __UINT8_C(c) c
 // MIPS64EL:#define __UINT8_C_SUFFIX__
 // MIPS64EL:#define __UINT8_MAX__ 255
 // MIPS64EL:#define __UINT8_TYPE__ unsigned char
+// MIPS64EL:#define __UINTMAX_C(c) c##UL
 // MIPS64EL:#define __UINTMAX_C_SUFFIX__ UL
 // MIPS64EL:#define __UINTMAX_MAX__ 18446744073709551615UL
 // MIPS64EL:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init-ppc.c b/clang/test/Preprocessor/init-ppc.c
index 1421b102a3dfd..6b7eceda9b97b 100644
--- a/clang/test/Preprocessor/init-ppc.c
+++ b/clang/test/Preprocessor/init-ppc.c
@@ -41,26 +41,31 @@
 // PPC603E:#define __FLT_MIN_EXP__ (-125)
 // PPC603E:#define __FLT_MIN__ 1.17549435e-38F
 // PPC603E:#define __FLT_RADIX__ 2
+// PPC603E:#define __INT16_C(c) c
 // PPC603E:#define __INT16_C_SUFFIX__
 // PPC603E:#define __INT16_FMTd__ "hd"
 // PPC603E:#define __INT16_FMTi__ "hi"
 // PPC603E:#define __INT16_MAX__ 32767
 // PPC603E:#define __INT16_TYPE__ short
+// PPC603E:#define __INT32_C(c) c
 // PPC603E:#define __INT32_C_SUFFIX__
 // PPC603E:#define __INT32_FMTd__ "d"
 // PPC603E:#define __INT32_FMTi__ "i"
 // PPC603E:#define __INT32_MAX__ 2147483647
 // PPC603E:#define __INT32_TYPE__ int
+// PPC603E:#define __INT64_C(c) c##LL
 // PPC603E:#define __INT64_C_SUFFIX__ LL
 // PPC603E:#define __INT64_FMTd__ "lld"
 // PPC603E:#define __INT64_FMTi__ "lli"
 // PPC603E:#define __INT64_MAX__ 9223372036854775807LL
 // PPC603E:#define __INT64_TYPE__ long long int
+// PPC603E:#define __INT8_C(c) c
 // PPC603E:#define __INT8_C_SUFFIX__
 // PPC603E:#define __INT8_FMTd__ "hhd"
 // PPC603E:#define __INT8_FMTi__ "hhi"
 // PPC603E:#define __INT8_MAX__ 127
 // PPC603E:#define __INT8_TYPE__ signed char
+// PPC603E:#define __INTMAX_C(c) c##LL
 // PPC603E:#define __INTMAX_C_SUFFIX__ LL
 // PPC603E:#define __INTMAX_FMTd__ "lld"
 // PPC603E:#define __INTMAX_FMTi__ "lli"
@@ -150,18 +155,23 @@
 // PPC603E:#define __SIZE_TYPE__ long unsigned int
 // PPC603E:#define __SIZE_WIDTH__ 32
 // PPC603E-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// PPC603E:#define __UINT16_C(c) c
 // PPC603E:#define __UINT16_C_SUFFIX__
 // PPC603E:#define __UINT16_MAX__ 65535
 // PPC603E:#define __UINT16_TYPE__ unsigned short
+// PPC603E:#define __UINT32_C(c) c##U
 // PPC603E:#define __UINT32_C_SUFFIX__ U
 // PPC603E:#define __UINT32_MAX__ 4294967295U
 // PPC603E:#define __UINT32_TYPE__ unsigned int
+// PPC603E:#define __UINT64_C(c) c##ULL
 // PPC603E:#define __UINT64_C_SUFFIX__ ULL
 // PPC603E:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINT64_TYPE__ long long unsigned int
+// PPC603E:#define __UINT8_C(c) c
 // PPC603E:#define __UINT8_C_SUFFIX__
 // PPC603E:#define __UINT8_MAX__ 255
 // PPC603E:#define __UINT8_TYPE__ unsigned char
+// PPC603E:#define __UINTMAX_C(c) c##ULL
 // PPC603E:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC603E:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC603E:#define __UINTMAX_TYPE__ long long unsigned int
@@ -235,26 +245,31 @@
 // PPC:#define __FLT_MIN__ 1.17549435e-38F
 // PPC:#define __FLT_RADIX__ 2
 // PPC:#define __HAVE_BSWAP__ 1
+// PPC:#define __INT16_C(c) c
 // PPC:#define __INT16_C_SUFFIX__
 // PPC:#define __INT16_FMTd__ "hd"
 // PPC:#define __INT16_FMTi__ "hi"
 // PPC:#define __INT16_MAX__ 32767
 // PPC:#define __INT16_TYPE__ short
+// PPC:#define __INT32_C(c) c
 // PPC:#define __INT32_C_SUFFIX__
 // PPC:#define __INT32_FMTd__ "d"
 // PPC:#define __INT32_FMTi__ "i"
 // PPC:#define __INT32_MAX__ 2147483647
 // PPC:#define __INT32_TYPE__ int
+// PPC:#define __INT64_C(c) c##LL
 // PPC:#define __INT64_C_SUFFIX__ LL
 // PPC:#define __INT64_FMTd__ "lld"
 // PPC:#define __INT64_FMTi__ "lli"
 // PPC:#define __INT64_MAX__ 9223372036854775807LL
 // PPC:#define __INT64_TYPE__ long long int
+// PPC:#define __INT8_C(c) c
 // PPC:#define __INT8_C_SUFFIX__
 // PPC:#define __INT8_FMTd__ "hhd"
 // PPC:#define __INT8_FMTi__ "hhi"
 // PPC:#define __INT8_MAX__ 127
 // PPC:#define __INT8_TYPE__ signed char
+// PPC:#define __INTMAX_C(c) c##LL
 // PPC:#define __INTMAX_C_SUFFIX__ LL
 // PPC:#define __INTMAX_FMTd__ "lld"
 // PPC:#define __INTMAX_FMTi__ "lli"
@@ -344,18 +359,23 @@
 // PPC:#define __SIZE_MAX__ 4294967295UL
 // PPC:#define __SIZE_TYPE__ long unsigned int
 // PPC:#define __SIZE_WIDTH__ 32
+// PPC:#define __UINT16_C(c) c
 // PPC:#define __UINT16_C_SUFFIX__
 // PPC:#define __UINT16_MAX__ 65535
 // PPC:#define __UINT16_TYPE__ unsigned short
+// PPC:#define __UINT32_C(c) c##U
 // PPC:#define __UINT32_C_SUFFIX__ U
 // PPC:#define __UINT32_MAX__ 4294967295U
 // PPC:#define __UINT32_TYPE__ unsigned int
+// PPC:#define __UINT64_C(c) c##ULL
 // PPC:#define __UINT64_C_SUFFIX__ ULL
 // PPC:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC:#define __UINT64_TYPE__ long long unsigned int
+// PPC:#define __UINT8_C(c) c
 // PPC:#define __UINT8_C_SUFFIX__
 // PPC:#define __UINT8_MAX__ 255
 // PPC:#define __UINT8_TYPE__ unsigned char
+// PPC:#define __UINTMAX_C(c) c##ULL
 // PPC:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC:#define __UINTMAX_TYPE__ long long unsigned int
@@ -435,26 +455,31 @@
 // PPC-AIX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-AIX:#define __FLT_RADIX__ 2
 // PPC-AIX:#define __HOS_AIX__ 1
+// PPC-AIX:#define __INT16_C(c) c
 // PPC-AIX:#define __INT16_C_SUFFIX__
 // PPC-AIX:#define __INT16_FMTd__ "hd"
 // PPC-AIX:#define __INT16_FMTi__ "hi"
 // PPC-AIX:#define __INT16_MAX__ 32767
 // PPC-AIX:#define __INT16_TYPE__ short
+// PPC-AIX:#define __INT32_C(c) c
 // PPC-AIX:#define __INT32_C_SUFFIX__
 // PPC-AIX:#define __INT32_FMTd__ "d"
 // PPC-AIX:#define __INT32_FMTi__ "i"
 // PPC-AIX:#define __INT32_MAX__ 2147483647
 // PPC-AIX:#define __INT32_TYPE__ int
+// PPC-AIX:#define __INT64_C(c) c##LL
 // PPC-AIX:#define __INT64_C_SUFFIX__ LL
 // PPC-AIX:#define __INT64_FMTd__ "lld"
 // PPC-AIX:#define __INT64_FMTi__ "lli"
 // PPC-AIX:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-AIX:#define __INT64_TYPE__ long long int
+// PPC-AIX:#define __INT8_C(c) c
 // PPC-AIX:#define __INT8_C_SUFFIX__
 // PPC-AIX:#define __INT8_FMTd__ "hhd"
 // PPC-AIX:#define __INT8_FMTi__ "hhi"
 // PPC-AIX:#define __INT8_MAX__ 127
 // PPC-AIX:#define __INT8_TYPE__ signed char
+// PPC-AIX:#define __INTMAX_C(c) c##LL
 // PPC-AIX:#define __INTMAX_C_SUFFIX__ LL
 // PPC-AIX:#define __INTMAX_FMTd__ "lld"
 // PPC-AIX:#define __INTMAX_FMTi__ "lli"
@@ -546,18 +571,23 @@
 // PPC-AIX:#define __THW_BIG_ENDIAN__ 1
 // PPC-AIX:#define __THW_PPC__ 1
 // PPC-AIX:#define __TOS_AIX__ 1
+// PPC-AIX:#define __UINT16_C(c) c
 // PPC-AIX:#define __UINT16_C_SUFFIX__
 // PPC-AIX:#define __UINT16_MAX__ 65535
 // PPC-AIX:#define __UINT16_TYPE__ unsigned short
+// PPC-AIX:#define __UINT32_C(c) c##U
 // PPC-AIX:#define __UINT32_C_SUFFIX__ U
 // PPC-AIX:#define __UINT32_MAX__ 4294967295U
 // PPC-AIX:#define __UINT32_TYPE__ unsigned int
+// PPC-AIX:#define __UINT64_C(c) c##ULL
 // PPC-AIX:#define __UINT64_C_SUFFIX__ ULL
 // PPC-AIX:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-AIX:#define __UINT64_TYPE__ long long unsigned int
+// PPC-AIX:#define __UINT8_C(c) c
 // PPC-AIX:#define __UINT8_C_SUFFIX__
 // PPC-AIX:#define __UINT8_MAX__ 255
 // PPC-AIX:#define __UINT8_TYPE__ unsigned char
+// PPC-AIX:#define __UINTMAX_C(c) c##ULL
 // PPC-AIX:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC-AIX:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC-AIX:#define __UINTMAX_TYPE__ long long unsigned int
@@ -807,26 +837,31 @@
 // PPC-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC-LINUX:#define __FLT_RADIX__ 2
 // PPC-LINUX:#define __HAVE_BSWAP__ 1
+// PPC-LINUX:#define __INT16_C(c) c
 // PPC-LINUX:#define __INT16_C_SUFFIX__
 // PPC-LINUX:#define __INT16_FMTd__ "hd"
 // PPC-LINUX:#define __INT16_FMTi__ "hi"
 // PPC-LINUX:#define __INT16_MAX__ 32767
 // PPC-LINUX:#define __INT16_TYPE__ short
+// PPC-LINUX:#define __INT32_C(c) c
 // PPC-LINUX:#define __INT32_C_SUFFIX__
 // PPC-LINUX:#define __INT32_FMTd__ "d"
 // PPC-LINUX:#define __INT32_FMTi__ "i"
 // PPC-LINUX:#define __INT32_MAX__ 2147483647
 // PPC-LINUX:#define __INT32_TYPE__ int
+// PPC-LINUX:#define __INT64_C(c) c##LL
 // PPC-LINUX:#define __INT64_C_SUFFIX__ LL
 // PPC-LINUX:#define __INT64_FMTd__ "lld"
 // PPC-LINUX:#define __INT64_FMTi__ "lli"
 // PPC-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // PPC-LINUX:#define __INT64_TYPE__ long long int
+// PPC-LINUX:#define __INT8_C(c) c
 // PPC-LINUX:#define __INT8_C_SUFFIX__
 // PPC-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC-LINUX:#define __INT8_MAX__ 127
 // PPC-LINUX:#define __INT8_TYPE__ signed char
+// PPC-LINUX:#define __INTMAX_C(c) c##LL
 // PPC-LINUX:#define __INTMAX_C_SUFFIX__ LL
 // PPC-LINUX:#define __INTMAX_FMTd__ "lld"
 // PPC-LINUX:#define __INTMAX_FMTi__ "lli"
@@ -915,18 +950,23 @@
 // PPC-LINUX:#define __SIZE_MAX__ 4294967295U
 // PPC-LINUX:#define __SIZE_TYPE__ unsigned int
 // PPC-LINUX:#define __SIZE_WIDTH__ 32
+// PPC-LINUX:#define __UINT16_C(c) c
 // PPC-LINUX:#define __UINT16_C_SUFFIX__
 // PPC-LINUX:#define __UINT16_MAX__ 65535
 // PPC-LINUX:#define __UINT16_TYPE__ unsigned short
+// PPC-LINUX:#define __UINT32_C(c) c##U
 // PPC-LINUX:#define __UINT32_C_SUFFIX__ U
 // PPC-LINUX:#define __UINT32_MAX__ 4294967295U
 // PPC-LINUX:#define __UINT32_TYPE__ unsigned int
+// PPC-LINUX:#define __UINT64_C(c) c##ULL
 // PPC-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // PPC-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // PPC-LINUX:#define __UINT64_TYPE__ long long unsigned int
+// PPC-LINUX:#define __UINT8_C(c) c
 // PPC-LINUX:#define __UINT8_C_SUFFIX__
 // PPC-LINUX:#define __UINT8_MAX__ 255
 // PPC-LINUX:#define __UINT8_TYPE__ unsigned char
+// PPC-LINUX:#define __UINTMAX_C(c) c##ULL
 // PPC-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
 // PPC-LINUX:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // PPC-LINUX:#define __UINTMAX_TYPE__ long long unsigned int
diff --git a/clang/test/Preprocessor/init-ppc64.c b/clang/test/Preprocessor/init-ppc64.c
index 57e2ca31d5d53..7dffd4627481b 100644
--- a/clang/test/Preprocessor/init-ppc64.c
+++ b/clang/test/Preprocessor/init-ppc64.c
@@ -47,26 +47,31 @@
 // PPC64:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64:#define __FLT_RADIX__ 2
 // PPC64:#define __HAVE_BSWAP__ 1
+// PPC64:#define __INT16_C(c) c
 // PPC64:#define __INT16_C_SUFFIX__
 // PPC64:#define __INT16_FMTd__ "hd"
 // PPC64:#define __INT16_FMTi__ "hi"
 // PPC64:#define __INT16_MAX__ 32767
 // PPC64:#define __INT16_TYPE__ short
+// PPC64:#define __INT32_C(c) c
 // PPC64:#define __INT32_C_SUFFIX__
 // PPC64:#define __INT32_FMTd__ "d"
 // PPC64:#define __INT32_FMTi__ "i"
 // PPC64:#define __INT32_MAX__ 2147483647
 // PPC64:#define __INT32_TYPE__ int
+// PPC64:#define __INT64_C(c) c##L
 // PPC64:#define __INT64_C_SUFFIX__ L
 // PPC64:#define __INT64_FMTd__ "ld"
 // PPC64:#define __INT64_FMTi__ "li"
 // PPC64:#define __INT64_MAX__ 9223372036854775807L
 // PPC64:#define __INT64_TYPE__ long int
+// PPC64:#define __INT8_C(c) c
 // PPC64:#define __INT8_C_SUFFIX__
 // PPC64:#define __INT8_FMTd__ "hhd"
 // PPC64:#define __INT8_FMTi__ "hhi"
 // PPC64:#define __INT8_MAX__ 127
 // PPC64:#define __INT8_TYPE__ signed char
+// PPC64:#define __INTMAX_C(c) c##L
 // PPC64:#define __INTMAX_C_SUFFIX__ L
 // PPC64:#define __INTMAX_FMTd__ "ld"
 // PPC64:#define __INTMAX_FMTi__ "li"
@@ -157,18 +162,23 @@
 // PPC64:#define __SIZE_TYPE__ long unsigned int
 // PPC64:#define __SIZE_WIDTH__ 64
 // PPC64-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// PPC64:#define __UINT16_C(c) c
 // PPC64:#define __UINT16_C_SUFFIX__
 // PPC64:#define __UINT16_MAX__ 65535
 // PPC64:#define __UINT16_TYPE__ unsigned short
+// PPC64:#define __UINT32_C(c) c##U
 // PPC64:#define __UINT32_C_SUFFIX__ U
 // PPC64:#define __UINT32_MAX__ 4294967295U
 // PPC64:#define __UINT32_TYPE__ unsigned int
+// PPC64:#define __UINT64_C(c) c##UL
 // PPC64:#define __UINT64_C_SUFFIX__ UL
 // PPC64:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64:#define __UINT64_TYPE__ long unsigned int
+// PPC64:#define __UINT8_C(c) c
 // PPC64:#define __UINT8_C_SUFFIX__
 // PPC64:#define __UINT8_MAX__ 255
 // PPC64:#define __UINT8_TYPE__ unsigned char
+// PPC64:#define __UINTMAX_C(c) c##UL
 // PPC64:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64:#define __UINTMAX_TYPE__ long unsigned int
@@ -250,26 +260,31 @@
 // PPC64LE:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64LE:#define __FLT_RADIX__ 2
 // PPC64LE:#define __HAVE_BSWAP__ 1
+// PPC64LE:#define __INT16_C(c) c
 // PPC64LE:#define __INT16_C_SUFFIX__
 // PPC64LE:#define __INT16_FMTd__ "hd"
 // PPC64LE:#define __INT16_FMTi__ "hi"
 // PPC64LE:#define __INT16_MAX__ 32767
 // PPC64LE:#define __INT16_TYPE__ short
+// PPC64LE:#define __INT32_C(c) c
 // PPC64LE:#define __INT32_C_SUFFIX__
 // PPC64LE:#define __INT32_FMTd__ "d"
 // PPC64LE:#define __INT32_FMTi__ "i"
 // PPC64LE:#define __INT32_MAX__ 2147483647
 // PPC64LE:#define __INT32_TYPE__ int
+// PPC64LE:#define __INT64_C(c) c##L
 // PPC64LE:#define __INT64_C_SUFFIX__ L
 // PPC64LE:#define __INT64_FMTd__ "ld"
 // PPC64LE:#define __INT64_FMTi__ "li"
 // PPC64LE:#define __INT64_MAX__ 9223372036854775807L
 // PPC64LE:#define __INT64_TYPE__ long int
+// PPC64LE:#define __INT8_C(c) c
 // PPC64LE:#define __INT8_C_SUFFIX__
 // PPC64LE:#define __INT8_FMTd__ "hhd"
 // PPC64LE:#define __INT8_FMTi__ "hhi"
 // PPC64LE:#define __INT8_MAX__ 127
 // PPC64LE:#define __INT8_TYPE__ signed char
+// PPC64LE:#define __INTMAX_C(c) c##L
 // PPC64LE:#define __INTMAX_C_SUFFIX__ L
 // PPC64LE:#define __INTMAX_FMTd__ "ld"
 // PPC64LE:#define __INTMAX_FMTi__ "li"
@@ -361,18 +376,23 @@
 // PPC64LE:#define __SIZE_TYPE__ long unsigned int
 // PPC64LE:#define __SIZE_WIDTH__ 64
 // PPC64LE:#define __STRUCT_PARM_ALIGN__ 16
+// PPC64LE:#define __UINT16_C(c) c
 // PPC64LE:#define __UINT16_C_SUFFIX__
 // PPC64LE:#define __UINT16_MAX__ 65535
 // PPC64LE:#define __UINT16_TYPE__ unsigned short
+// PPC64LE:#define __UINT32_C(c) c##U
 // PPC64LE:#define __UINT32_C_SUFFIX__ U
 // PPC64LE:#define __UINT32_MAX__ 4294967295U
 // PPC64LE:#define __UINT32_TYPE__ unsigned int
+// PPC64LE:#define __UINT64_C(c) c##UL
 // PPC64LE:#define __UINT64_C_SUFFIX__ UL
 // PPC64LE:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64LE:#define __UINT64_TYPE__ long unsigned int
+// PPC64LE:#define __UINT8_C(c) c
 // PPC64LE:#define __UINT8_C_SUFFIX__
 // PPC64LE:#define __UINT8_MAX__ 255
 // PPC64LE:#define __UINT8_TYPE__ unsigned char
+// PPC64LE:#define __UINTMAX_C(c) c##UL
 // PPC64LE:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64LE:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64LE:#define __UINTMAX_TYPE__ long unsigned int
@@ -733,26 +753,31 @@
 // PPC64-AIX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64-AIX:#define __FLT_RADIX__ 2
 // PPC64-AIX-NOT:#define __ILP32__ 1
+// PPC64-AIX:#define __INT16_C(c) c
 // PPC64-AIX:#define __INT16_C_SUFFIX__
 // PPC64-AIX:#define __INT16_FMTd__ "hd"
 // PPC64-AIX:#define __INT16_FMTi__ "hi"
 // PPC64-AIX:#define __INT16_MAX__ 32767
 // PPC64-AIX:#define __INT16_TYPE__ short
+// PPC64-AIX:#define __INT32_C(c) c
 // PPC64-AIX:#define __INT32_C_SUFFIX__
 // PPC64-AIX:#define __INT32_FMTd__ "d"
 // PPC64-AIX:#define __INT32_FMTi__ "i"
 // PPC64-AIX:#define __INT32_MAX__ 2147483647
 // PPC64-AIX:#define __INT32_TYPE__ int
+// PPC64-AIX:#define __INT64_C(c) c##L
 // PPC64-AIX:#define __INT64_C_SUFFIX__ L
 // PPC64-AIX:#define __INT64_FMTd__ "ld"
 // PPC64-AIX:#define __INT64_FMTi__ "li"
 // PPC64-AIX:#define __INT64_MAX__ 9223372036854775807L
 // PPC64-AIX:#define __INT64_TYPE__ long int
+// PPC64-AIX:#define __INT8_C(c) c
 // PPC64-AIX:#define __INT8_C_SUFFIX__
 // PPC64-AIX:#define __INT8_FMTd__ "hhd"
 // PPC64-AIX:#define __INT8_FMTi__ "hhi"
 // PPC64-AIX:#define __INT8_MAX__ 127
 // PPC64-AIX:#define __INT8_TYPE__ signed char
+// PPC64-AIX:#define __INTMAX_C(c) c##L
 // PPC64-AIX:#define __INTMAX_C_SUFFIX__ L
 // PPC64-AIX:#define __INTMAX_FMTd__ "ld"
 // PPC64-AIX:#define __INTMAX_FMTi__ "li"
@@ -842,18 +867,23 @@
 // PPC64-AIX:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64-AIX:#define __SIZE_TYPE__ long unsigned int
 // PPC64-AIX:#define __SIZE_WIDTH__ 64
+// PPC64-AIX:#define __UINT16_C(c) c
 // PPC64-AIX:#define __UINT16_C_SUFFIX__
 // PPC64-AIX:#define __UINT16_MAX__ 65535
 // PPC64-AIX:#define __UINT16_TYPE__ unsigned short
+// PPC64-AIX:#define __UINT32_C(c) c##U
 // PPC64-AIX:#define __UINT32_C_SUFFIX__ U
 // PPC64-AIX:#define __UINT32_MAX__ 4294967295U
 // PPC64-AIX:#define __UINT32_TYPE__ unsigned int
+// PPC64-AIX:#define __UINT64_C(c) c##UL
 // PPC64-AIX:#define __UINT64_C_SUFFIX__ UL
 // PPC64-AIX:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64-AIX:#define __UINT64_TYPE__ long unsigned int
+// PPC64-AIX:#define __UINT8_C(c) c
 // PPC64-AIX:#define __UINT8_C_SUFFIX__
 // PPC64-AIX:#define __UINT8_MAX__ 255
 // PPC64-AIX:#define __UINT8_TYPE__ unsigned char
+// PPC64-AIX:#define __UINTMAX_C(c) c##UL
 // PPC64-AIX:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64-AIX:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64-AIX:#define __UINTMAX_TYPE__ long unsigned int
@@ -930,26 +960,31 @@
 // PPC64-LINUX:#define __FLT_MIN__ 1.17549435e-38F
 // PPC64-LINUX:#define __FLT_RADIX__ 2
 // PPC64-LINUX:#define __HAVE_BSWAP__ 1
+// PPC64-LINUX:#define __INT16_C(c) c
 // PPC64-LINUX:#define __INT16_C_SUFFIX__
 // PPC64-LINUX:#define __INT16_FMTd__ "hd"
 // PPC64-LINUX:#define __INT16_FMTi__ "hi"
 // PPC64-LINUX:#define __INT16_MAX__ 32767
 // PPC64-LINUX:#define __INT16_TYPE__ short
+// PPC64-LINUX:#define __INT32_C(c) c
 // PPC64-LINUX:#define __INT32_C_SUFFIX__
 // PPC64-LINUX:#define __INT32_FMTd__ "d"
 // PPC64-LINUX:#define __INT32_FMTi__ "i"
 // PPC64-LINUX:#define __INT32_MAX__ 2147483647
 // PPC64-LINUX:#define __INT32_TYPE__ int
+// PPC64-LINUX:#define __INT64_C(c) c##L
 // PPC64-LINUX:#define __INT64_C_SUFFIX__ L
 // PPC64-LINUX:#define __INT64_FMTd__ "ld"
 // PPC64-LINUX:#define __INT64_FMTi__ "li"
 // PPC64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // PPC64-LINUX:#define __INT64_TYPE__ long int
+// PPC64-LINUX:#define __INT8_C(c) c
 // PPC64-LINUX:#define __INT8_C_SUFFIX__
 // PPC64-LINUX:#define __INT8_FMTd__ "hhd"
 // PPC64-LINUX:#define __INT8_FMTi__ "hhi"
 // PPC64-LINUX:#define __INT8_MAX__ 127
 // PPC64-LINUX:#define __INT8_TYPE__ signed char
+// PPC64-LINUX:#define __INTMAX_C(c) c##L
 // PPC64-LINUX:#define __INTMAX_C_SUFFIX__ L
 // PPC64-LINUX:#define __INTMAX_FMTd__ "ld"
 // PPC64-LINUX:#define __INTMAX_FMTi__ "li"
@@ -1039,18 +1074,23 @@
 // PPC64-LINUX:#define __SIZE_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __SIZE_TYPE__ long unsigned int
 // PPC64-LINUX:#define __SIZE_WIDTH__ 64
+// PPC64-LINUX:#define __UINT16_C(c) c
 // PPC64-LINUX:#define __UINT16_C_SUFFIX__
 // PPC64-LINUX:#define __UINT16_MAX__ 65535
 // PPC64-LINUX:#define __UINT16_TYPE__ unsigned short
+// PPC64-LINUX:#define __UINT32_C(c) c##U
 // PPC64-LINUX:#define __UINT32_C_SUFFIX__ U
 // PPC64-LINUX:#define __UINT32_MAX__ 4294967295U
 // PPC64-LINUX:#define __UINT32_TYPE__ unsigned int
+// PPC64-LINUX:#define __UINT64_C(c) c##UL
 // PPC64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // PPC64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __UINT64_TYPE__ long unsigned int
+// PPC64-LINUX:#define __UINT8_C(c) c
 // PPC64-LINUX:#define __UINT8_C_SUFFIX__
 // PPC64-LINUX:#define __UINT8_MAX__ 255
 // PPC64-LINUX:#define __UINT8_TYPE__ unsigned char
+// PPC64-LINUX:#define __UINTMAX_C(c) c##UL
 // PPC64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
 // PPC64-LINUX:#define __UINTMAX_MAX__ 18446744073709551615UL
 // PPC64-LINUX:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init-s390x.c b/clang/test/Preprocessor/init-s390x.c
index 6d08e9bfcb632..a8fbde46cbb75 100644
--- a/clang/test/Preprocessor/init-s390x.c
+++ b/clang/test/Preprocessor/init-s390x.c
@@ -34,26 +34,31 @@
 // S390X:#define __FLT_MIN_EXP__ (-125)
 // S390X:#define __FLT_MIN__ 1.17549435e-38F
 // S390X:#define __FLT_RADIX__ 2
+// S390X:#define __INT16_C(c) c
 // S390X:#define __INT16_C_SUFFIX__
 // S390X:#define __INT16_FMTd__ "hd"
 // S390X:#define __INT16_FMTi__ "hi"
 // S390X:#define __INT16_MAX__ 32767
 // S390X:#define __INT16_TYPE__ short
+// S390X:#define __INT32_C(c) c
 // S390X:#define __INT32_C_SUFFIX__
 // S390X:#define __INT32_FMTd__ "d"
 // S390X:#define __INT32_FMTi__ "i"
 // S390X:#define __INT32_MAX__ 2147483647
 // S390X:#define __INT32_TYPE__ int
+// S390X:#define __INT64_C(c) c##L
 // S390X:#define __INT64_C_SUFFIX__ L
 // S390X:#define __INT64_FMTd__ "ld"
 // S390X:#define __INT64_FMTi__ "li"
 // S390X:#define __INT64_MAX__ 9223372036854775807L
 // S390X:#define __INT64_TYPE__ long int
+// S390X:#define __INT8_C(c) c
 // S390X:#define __INT8_C_SUFFIX__
 // S390X:#define __INT8_FMTd__ "hhd"
 // S390X:#define __INT8_FMTi__ "hhi"
 // S390X:#define __INT8_MAX__ 127
 // S390X:#define __INT8_TYPE__ signed char
+// S390X:#define __INTMAX_C(c) c##L
 // S390X:#define __INTMAX_C_SUFFIX__ L
 // S390X:#define __INTMAX_FMTd__ "ld"
 // S390X:#define __INTMAX_FMTi__ "li"
@@ -136,18 +141,23 @@
 // S390X:#define __SIZE_TYPE__ long unsigned int
 // S390X:#define __SIZE_WIDTH__ 64
 // S390X-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8UL
+// S390X:#define __UINT16_C(c) c
 // S390X:#define __UINT16_C_SUFFIX__
 // S390X:#define __UINT16_MAX__ 65535
 // S390X:#define __UINT16_TYPE__ unsigned short
+// S390X:#define __UINT32_C(c) c##U
 // S390X:#define __UINT32_C_SUFFIX__ U
 // S390X:#define __UINT32_MAX__ 4294967295U
 // S390X:#define __UINT32_TYPE__ unsigned int
+// S390X:#define __UINT64_C(c) c##UL
 // S390X:#define __UINT64_C_SUFFIX__ UL
 // S390X:#define __UINT64_MAX__ 18446744073709551615UL
 // S390X:#define __UINT64_TYPE__ long unsigned int
+// S390X:#define __UINT8_C(c) c
 // S390X:#define __UINT8_C_SUFFIX__
 // S390X:#define __UINT8_MAX__ 255
 // S390X:#define __UINT8_TYPE__ unsigned char
+// S390X:#define __UINTMAX_C(c) c##UL
 // S390X:#define __UINTMAX_C_SUFFIX__ UL
 // S390X:#define __UINTMAX_MAX__ 18446744073709551615UL
 // S390X:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init-v7k-compat.c b/clang/test/Preprocessor/init-v7k-compat.c
index ff5d4bbdea53a..a9c6e7a290646 100644
--- a/clang/test/Preprocessor/init-v7k-compat.c
+++ b/clang/test/Preprocessor/init-v7k-compat.c
@@ -39,26 +39,31 @@
 // CHECK: #define __FLT_MIN_EXP__ (-125)
 // CHECK: #define __FLT_MIN__ 1.17549435e-38F
 // CHECK: #define __FLT_RADIX__ 2
+// CHECK: #define __INT16_C(c) c
 // CHECK: #define __INT16_C_SUFFIX__ {{$}}
 // CHECK: #define __INT16_FMTd__ "hd"
 // CHECK: #define __INT16_FMTi__ "hi"
 // CHECK: #define __INT16_MAX__ 32767
 // CHECK: #define __INT16_TYPE__ short
+// CHECK: #define __INT32_C(c) c
 // CHECK: #define __INT32_C_SUFFIX__ {{$}}
 // CHECK: #define __INT32_FMTd__ "d"
 // CHECK: #define __INT32_FMTi__ "i"
 // CHECK: #define __INT32_MAX__ 2147483647
 // CHECK: #define __INT32_TYPE__ int
+// CHECK: #define __INT64_C(c) c##LL
 // CHECK: #define __INT64_C_SUFFIX__ LL
 // CHECK: #define __INT64_FMTd__ "lld"
 // CHECK: #define __INT64_FMTi__ "lli"
 // CHECK: #define __INT64_MAX__ 9223372036854775807LL
 // CHECK: #define __INT64_TYPE__ long long int
+// CHECK: #define __INT8_C(c) c
 // CHECK: #define __INT8_C_SUFFIX__ {{$}}
 // CHECK: #define __INT8_FMTd__ "hhd"
 // CHECK: #define __INT8_FMTi__ "hhi"
 // CHECK: #define __INT8_MAX__ 127
 // CHECK: #define __INT8_TYPE__ signed char
+// CHECK: #define __INTMAX_C(c) c##LL
 // CHECK: #define __INTMAX_C_SUFFIX__ LL
 // CHECK: #define __INTMAX_FMTd__ "lld"
 // CHECK: #define __INTMAX_FMTi__ "lli"
@@ -140,18 +145,23 @@
 // CHECK: #define __SIZE_MAX__ 4294967295UL
 // CHECK: #define __SIZE_TYPE__ long unsigned int
 // CHECK: #define __SIZE_WIDTH__ 32
+// CHECK: #define __UINT16_C(c) c
 // CHECK: #define __UINT16_C_SUFFIX__ {{$}}
 // CHECK: #define __UINT16_MAX__ 65535
 // CHECK: #define __UINT16_TYPE__ unsigned short
+// CHECK: #define __UINT32_C(c) c##U
 // CHECK: #define __UINT32_C_SUFFIX__ U
 // CHECK: #define __UINT32_MAX__ 4294967295U
 // CHECK: #define __UINT32_TYPE__ unsigned int
+// CHECK: #define __UINT64_C(c) c##ULL
 // CHECK: #define __UINT64_C_SUFFIX__ ULL
 // CHECK: #define __UINT64_MAX__ 18446744073709551615ULL
 // CHECK: #define __UINT64_TYPE__ long long unsigned int
+// CHECK: #define __UINT8_C(c) c
 // CHECK: #define __UINT8_C_SUFFIX__ {{$}}
 // CHECK: #define __UINT8_MAX__ 255
 // CHECK: #define __UINT8_TYPE__ unsigned char
+// CHECK: #define __UINTMAX_C(c) c##ULL
 // CHECK: #define __UINTMAX_C_SUFFIX__ ULL
 // CHECK: #define __UINTMAX_MAX__ 18446744073709551615ULL
 // CHECK: #define __UINTMAX_TYPE__ long long unsigned int
diff --git a/clang/test/Preprocessor/init-ve.c b/clang/test/Preprocessor/init-ve.c
index 13bdb12387db4..711c2a04865b3 100644
--- a/clang/test/Preprocessor/init-ve.c
+++ b/clang/test/Preprocessor/init-ve.c
@@ -45,26 +45,31 @@
 // VE:#define __FLT_MIN_EXP__ (-125)
 // VE:#define __FLT_MIN__ 1.17549435e-38F
 // VE:#define __FLT_RADIX__ 2
+// VE:#define __INT16_C(c) c
 // VE:#define __INT16_C_SUFFIX__
 // VE:#define __INT16_FMTd__ "hd"
 // VE:#define __INT16_FMTi__ "hi"
 // VE:#define __INT16_MAX__ 32767
 // VE:#define __INT16_TYPE__ short
+// VE:#define __INT32_C(c) c
 // VE:#define __INT32_C_SUFFIX__
 // VE:#define __INT32_FMTd__ "d"
 // VE:#define __INT32_FMTi__ "i"
 // VE:#define __INT32_MAX__ 2147483647
 // VE:#define __INT32_TYPE__ int
+// VE:#define __INT64_C(c) c##L
 // VE:#define __INT64_C_SUFFIX__ L
 // VE:#define __INT64_FMTd__ "ld"
 // VE:#define __INT64_FMTi__ "li"
 // VE:#define __INT64_MAX__ 9223372036854775807L
 // VE:#define __INT64_TYPE__ long int
+// VE:#define __INT8_C(c) c
 // VE:#define __INT8_C_SUFFIX__
 // VE:#define __INT8_FMTd__ "hhd"
 // VE:#define __INT8_FMTi__ "hhi"
 // VE:#define __INT8_MAX__ 127
 // VE:#define __INT8_TYPE__ signed char
+// VE:#define __INTMAX_C(c) c##L
 // VE:#define __INTMAX_C_SUFFIX__ L
 // VE:#define __INTMAX_FMTd__ "ld"
 // VE:#define __INTMAX_FMTi__ "li"
@@ -164,6 +169,7 @@
 // VE-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
 // VE-HOSTED:#define __STDC_HOSTED__ 1
 // VE-FREESTANDING:#define __STDC_HOSTED__ 0
+// VE:#define __UINT16_C(c) c
 // VE:#define __UINT16_C_SUFFIX__
 // VE:#define __UINT16_FMTX__ "hX"
 // VE:#define __UINT16_FMTo__ "ho"
@@ -171,6 +177,7 @@
 // VE:#define __UINT16_FMTx__ "hx"
 // VE:#define __UINT16_MAX__ 65535
 // VE:#define __UINT16_TYPE__ unsigned short
+// VE:#define __UINT32_C(c) c##U
 // VE:#define __UINT32_C_SUFFIX__ U
 // VE:#define __UINT32_FMTX__ "X"
 // VE:#define __UINT32_FMTo__ "o"
@@ -178,6 +185,7 @@
 // VE:#define __UINT32_FMTx__ "x"
 // VE:#define __UINT32_MAX__ 4294967295U
 // VE:#define __UINT32_TYPE__ unsigned int
+// VE:#define __UINT64_C(c) c##UL
 // VE:#define __UINT64_C_SUFFIX__ UL
 // VE:#define __UINT64_FMTX__ "lX"
 // VE:#define __UINT64_FMTo__ "lo"
@@ -185,6 +193,7 @@
 // VE:#define __UINT64_FMTx__ "lx"
 // VE:#define __UINT64_MAX__ 18446744073709551615UL
 // VE:#define __UINT64_TYPE__ long unsigned int
+// VE:#define __UINT8_C(c) c
 // VE:#define __UINT8_C_SUFFIX__
 // VE:#define __UINT8_FMTX__ "hhX"
 // VE:#define __UINT8_FMTo__ "hho"
@@ -192,6 +201,7 @@
 // VE:#define __UINT8_FMTx__ "hhx"
 // VE:#define __UINT8_MAX__ 255
 // VE:#define __UINT8_TYPE__ unsigned char
+// VE:#define __UINTMAX_C(c) c##UL
 // VE:#define __UINTMAX_C_SUFFIX__ UL
 // VE:#define __UINTMAX_FMTX__ "lX"
 // VE:#define __UINTMAX_FMTo__ "lo"
diff --git a/clang/test/Preprocessor/init-x86.c b/clang/test/Preprocessor/init-x86.c
index 6f5aa5674e48e..cb77b5583407c 100644
--- a/clang/test/Preprocessor/init-x86.c
+++ b/clang/test/Preprocessor/init-x86.c
@@ -35,26 +35,31 @@
 // I386:#define __FLT_MIN_EXP__ (-125)
 // I386:#define __FLT_MIN__ 1.17549435e-38F
 // I386:#define __FLT_RADIX__ 2
+// I386:#define __INT16_C(c) c
 // I386:#define __INT16_C_SUFFIX__
 // I386:#define __INT16_FMTd__ "hd"
 // I386:#define __INT16_FMTi__ "hi"
 // I386:#define __INT16_MAX__ 32767
 // I386:#define __INT16_TYPE__ short
+// I386:#define __INT32_C(c) c
 // I386:#define __INT32_C_SUFFIX__
 // I386:#define __INT32_FMTd__ "d"
 // I386:#define __INT32_FMTi__ "i"
 // I386:#define __INT32_MAX__ 2147483647
 // I386:#define __INT32_TYPE__ int
+// I386:#define __INT64_C(c) c##LL
 // I386:#define __INT64_C_SUFFIX__ LL
 // I386:#define __INT64_FMTd__ "lld"
 // I386:#define __INT64_FMTi__ "lli"
 // I386:#define __INT64_MAX__ 9223372036854775807LL
 // I386:#define __INT64_TYPE__ long long int
+// I386:#define __INT8_C(c) c
 // I386:#define __INT8_C_SUFFIX__
 // I386:#define __INT8_FMTd__ "hhd"
 // I386:#define __INT8_FMTi__ "hhi"
 // I386:#define __INT8_MAX__ 127
 // I386:#define __INT8_TYPE__ signed char
+// I386:#define __INTMAX_C(c) c##LL
 // I386:#define __INTMAX_C_SUFFIX__ LL
 // I386:#define __INTMAX_FMTd__ "lld"
 // I386:#define __INTMAX_FMTi__ "lli"
@@ -140,18 +145,23 @@
 // I386:#define __SIZE_MAX__ 4294967295U
 // I386:#define __SIZE_TYPE__ unsigned int
 // I386:#define __SIZE_WIDTH__ 32
+// I386:#define __UINT16_C(c) c
 // I386:#define __UINT16_C_SUFFIX__
 // I386:#define __UINT16_MAX__ 65535
 // I386:#define __UINT16_TYPE__ unsigned short
+// I386:#define __UINT32_C(c) c##U
 // I386:#define __UINT32_C_SUFFIX__ U
 // I386:#define __UINT32_MAX__ 4294967295U
 // I386:#define __UINT32_TYPE__ unsigned int
+// I386:#define __UINT64_C(c) c##ULL
 // I386:#define __UINT64_C_SUFFIX__ ULL
 // I386:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386:#define __UINT64_TYPE__ long long unsigned int
+// I386:#define __UINT8_C(c) c
 // I386:#define __UINT8_C_SUFFIX__
 // I386:#define __UINT8_MAX__ 255
 // I386:#define __UINT8_TYPE__ unsigned char
+// I386:#define __UINTMAX_C(c) c##ULL
 // I386:#define __UINTMAX_C_SUFFIX__ ULL
 // I386:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // I386:#define __UINTMAX_TYPE__ long long unsigned int
@@ -235,26 +245,31 @@
 // I386-LINUX:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // I386-LINUX:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // I386-LINUX:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// I386-LINUX:#define __INT16_C(c) c
 // I386-LINUX:#define __INT16_C_SUFFIX__
 // I386-LINUX:#define __INT16_FMTd__ "hd"
 // I386-LINUX:#define __INT16_FMTi__ "hi"
 // I386-LINUX:#define __INT16_MAX__ 32767
 // I386-LINUX:#define __INT16_TYPE__ short
+// I386-LINUX:#define __INT32_C(c) c
 // I386-LINUX:#define __INT32_C_SUFFIX__
 // I386-LINUX:#define __INT32_FMTd__ "d"
 // I386-LINUX:#define __INT32_FMTi__ "i"
 // I386-LINUX:#define __INT32_MAX__ 2147483647
 // I386-LINUX:#define __INT32_TYPE__ int
+// I386-LINUX:#define __INT64_C(c) c##LL
 // I386-LINUX:#define __INT64_C_SUFFIX__ LL
 // I386-LINUX:#define __INT64_FMTd__ "lld"
 // I386-LINUX:#define __INT64_FMTi__ "lli"
 // I386-LINUX:#define __INT64_MAX__ 9223372036854775807LL
 // I386-LINUX:#define __INT64_TYPE__ long long int
+// I386-LINUX:#define __INT8_C(c) c
 // I386-LINUX:#define __INT8_C_SUFFIX__
 // I386-LINUX:#define __INT8_FMTd__ "hhd"
 // I386-LINUX:#define __INT8_FMTi__ "hhi"
 // I386-LINUX:#define __INT8_MAX__ 127
 // I386-LINUX:#define __INT8_TYPE__ signed char
+// I386-LINUX:#define __INTMAX_C(c) c##LL
 // I386-LINUX:#define __INTMAX_C_SUFFIX__ LL
 // I386-LINUX:#define __INTMAX_FMTd__ "lld"
 // I386-LINUX:#define __INTMAX_FMTi__ "lli"
@@ -341,18 +356,23 @@
 // I386-LINUX:#define __SIZE_TYPE__ unsigned int
 // I386-LINUX:#define __SIZE_WIDTH__ 32
 // I386-LINUX-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U
+// I386-LINUX:#define __UINT16_C(c) c
 // I386-LINUX:#define __UINT16_C_SUFFIX__
 // I386-LINUX:#define __UINT16_MAX__ 65535
 // I386-LINUX:#define __UINT16_TYPE__ unsigned short
+// I386-LINUX:#define __UINT32_C(c) c##U
 // I386-LINUX:#define __UINT32_C_SUFFIX__ U
 // I386-LINUX:#define __UINT32_MAX__ 4294967295U
 // I386-LINUX:#define __UINT32_TYPE__ unsigned int
+// I386-LINUX:#define __UINT64_C(c) c##ULL
 // I386-LINUX:#define __UINT64_C_SUFFIX__ ULL
 // I386-LINUX:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-LINUX:#define __UINT64_TYPE__ long long unsigned int
+// I386-LINUX:#define __UINT8_C(c) c
 // I386-LINUX:#define __UINT8_C_SUFFIX__
 // I386-LINUX:#define __UINT8_MAX__ 255
 // I386-LINUX:#define __UINT8_TYPE__ unsigned char
+// I386-LINUX:#define __UINTMAX_C(c) c##ULL
 // I386-LINUX:#define __UINTMAX_C_SUFFIX__ ULL
 // I386-LINUX:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // I386-LINUX:#define __UINTMAX_TYPE__ long long unsigned int
@@ -436,26 +456,31 @@
 // I386-NETBSD:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // I386-NETBSD:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // I386-NETBSD:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// I386-NETBSD:#define __INT16_C(c) c
 // I386-NETBSD:#define __INT16_C_SUFFIX__
 // I386-NETBSD:#define __INT16_FMTd__ "hd"
 // I386-NETBSD:#define __INT16_FMTi__ "hi"
 // I386-NETBSD:#define __INT16_MAX__ 32767
 // I386-NETBSD:#define __INT16_TYPE__ short
+// I386-NETBSD:#define __INT32_C(c) c
 // I386-NETBSD:#define __INT32_C_SUFFIX__
 // I386-NETBSD:#define __INT32_FMTd__ "d"
 // I386-NETBSD:#define __INT32_FMTi__ "i"
 // I386-NETBSD:#define __INT32_MAX__ 2147483647
 // I386-NETBSD:#define __INT32_TYPE__ int
+// I386-NETBSD:#define __INT64_C(c) c##LL
 // I386-NETBSD:#define __INT64_C_SUFFIX__ LL
 // I386-NETBSD:#define __INT64_FMTd__ "lld"
 // I386-NETBSD:#define __INT64_FMTi__ "lli"
 // I386-NETBSD:#define __INT64_MAX__ 9223372036854775807LL
 // I386-NETBSD:#define __INT64_TYPE__ long long int
+// I386-NETBSD:#define __INT8_C(c) c
 // I386-NETBSD:#define __INT8_C_SUFFIX__
 // I386-NETBSD:#define __INT8_FMTd__ "hhd"
 // I386-NETBSD:#define __INT8_FMTi__ "hhi"
 // I386-NETBSD:#define __INT8_MAX__ 127
 // I386-NETBSD:#define __INT8_TYPE__ signed char
+// I386-NETBSD:#define __INTMAX_C(c) c##LL
 // I386-NETBSD:#define __INTMAX_C_SUFFIX__ LL
 // I386-NETBSD:#define __INTMAX_FMTd__ "lld"
 // I386-NETBSD:#define __INTMAX_FMTi__ "lli"
@@ -542,18 +567,23 @@
 // I386-NETBSD:#define __SIZE_TYPE__ unsigned int
 // I386-NETBSD:#define __SIZE_WIDTH__ 32
 // I386-NETBSD-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 4U
+// I386-NETBSD:#define __UINT16_C(c) c
 // I386-NETBSD:#define __UINT16_C_SUFFIX__
 // I386-NETBSD:#define __UINT16_MAX__ 65535
 // I386-NETBSD:#define __UINT16_TYPE__ unsigned short
+// I386-NETBSD:#define __UINT32_C(c) c##U
 // I386-NETBSD:#define __UINT32_C_SUFFIX__ U
 // I386-NETBSD:#define __UINT32_MAX__ 4294967295U
 // I386-NETBSD:#define __UINT32_TYPE__ unsigned int
+// I386-NETBSD:#define __UINT64_C(c) c##ULL
 // I386-NETBSD:#define __UINT64_C_SUFFIX__ ULL
 // I386-NETBSD:#define __UINT64_MAX__ 18446744073709551615ULL
 // I386-NETBSD:#define __UINT64_TYPE__ long long unsigned int
+// I386-NETBSD:#define __UINT8_C(c) c
 // I386-NETBSD:#define __UINT8_C_SUFFIX__
 // I386-NETBSD:#define __UINT8_MAX__ 255
 // I386-NETBSD:#define __UINT8_TYPE__ unsigned char
+// I386-NETBSD:#define __UINTMAX_C(c) c##ULL
 // I386-NETBSD:#define __UINTMAX_C_SUFFIX__ ULL
 // I386-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // I386-NETBSD:#define __UINTMAX_TYPE__ long long unsigned int
@@ -636,26 +666,31 @@
 // X86_64:#define __FLT_MIN_EXP__ (-125)
 // X86_64:#define __FLT_MIN__ 1.17549435e-38F
 // X86_64:#define __FLT_RADIX__ 2
+// X86_64:#define __INT16_C(c) c
 // X86_64:#define __INT16_C_SUFFIX__
 // X86_64:#define __INT16_FMTd__ "hd"
 // X86_64:#define __INT16_FMTi__ "hi"
 // X86_64:#define __INT16_MAX__ 32767
 // X86_64:#define __INT16_TYPE__ short
+// X86_64:#define __INT32_C(c) c
 // X86_64:#define __INT32_C_SUFFIX__
 // X86_64:#define __INT32_FMTd__ "d"
 // X86_64:#define __INT32_FMTi__ "i"
 // X86_64:#define __INT32_MAX__ 2147483647
 // X86_64:#define __INT32_TYPE__ int
+// X86_64:#define __INT64_C(c) c##L
 // X86_64:#define __INT64_C_SUFFIX__ L
 // X86_64:#define __INT64_FMTd__ "ld"
 // X86_64:#define __INT64_FMTi__ "li"
 // X86_64:#define __INT64_MAX__ 9223372036854775807L
 // X86_64:#define __INT64_TYPE__ long int
+// X86_64:#define __INT8_C(c) c
 // X86_64:#define __INT8_C_SUFFIX__
 // X86_64:#define __INT8_FMTd__ "hhd"
 // X86_64:#define __INT8_FMTi__ "hhi"
 // X86_64:#define __INT8_MAX__ 127
 // X86_64:#define __INT8_TYPE__ signed char
+// X86_64:#define __INTMAX_C(c) c##L
 // X86_64:#define __INTMAX_C_SUFFIX__ L
 // X86_64:#define __INTMAX_FMTd__ "ld"
 // X86_64:#define __INTMAX_FMTi__ "li"
@@ -748,18 +783,23 @@
 // X86_64:#define __SSE_MATH__ 1
 // X86_64:#define __SSE__ 1
 // X86_64-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16UL
+// X86_64:#define __UINT16_C(c) c
 // X86_64:#define __UINT16_C_SUFFIX__
 // X86_64:#define __UINT16_MAX__ 65535
 // X86_64:#define __UINT16_TYPE__ unsigned short
+// X86_64:#define __UINT32_C(c) c##U
 // X86_64:#define __UINT32_C_SUFFIX__ U
 // X86_64:#define __UINT32_MAX__ 4294967295U
 // X86_64:#define __UINT32_TYPE__ unsigned int
+// X86_64:#define __UINT64_C(c) c##UL
 // X86_64:#define __UINT64_C_SUFFIX__ UL
 // X86_64:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64:#define __UINT64_TYPE__ long unsigned int
+// X86_64:#define __UINT8_C(c) c
 // X86_64:#define __UINT8_C_SUFFIX__
 // X86_64:#define __UINT8_MAX__ 255
 // X86_64:#define __UINT8_TYPE__ unsigned char
+// X86_64:#define __UINTMAX_C(c) c##UL
 // X86_64:#define __UINTMAX_C_SUFFIX__ UL
 // X86_64:#define __UINTMAX_MAX__ 18446744073709551615UL
 // X86_64:#define __UINTMAX_TYPE__ long unsigned int
@@ -842,26 +882,31 @@
 // X32:#define __FLT_RADIX__ 2
 // X32:#define __ILP32__ 1
 // X32-NOT:#define __LP64__ 1
+// X32:#define __INT16_C(c) c
 // X32:#define __INT16_C_SUFFIX__
 // X32:#define __INT16_FMTd__ "hd"
 // X32:#define __INT16_FMTi__ "hi"
 // X32:#define __INT16_MAX__ 32767
 // X32:#define __INT16_TYPE__ short
+// X32:#define __INT32_C(c) c
 // X32:#define __INT32_C_SUFFIX__
 // X32:#define __INT32_FMTd__ "d"
 // X32:#define __INT32_FMTi__ "i"
 // X32:#define __INT32_MAX__ 2147483647
 // X32:#define __INT32_TYPE__ int
+// X32:#define __INT64_C(c) c##LL
 // X32:#define __INT64_C_SUFFIX__ LL
 // X32:#define __INT64_FMTd__ "lld"
 // X32:#define __INT64_FMTi__ "lli"
 // X32:#define __INT64_MAX__ 9223372036854775807LL
 // X32:#define __INT64_TYPE__ long long int
+// X32:#define __INT8_C(c) c
 // X32:#define __INT8_C_SUFFIX__
 // X32:#define __INT8_FMTd__ "hhd"
 // X32:#define __INT8_FMTi__ "hhi"
 // X32:#define __INT8_MAX__ 127
 // X32:#define __INT8_TYPE__ signed char
+// X32:#define __INTMAX_C(c) c##LL
 // X32:#define __INTMAX_C_SUFFIX__ LL
 // X32:#define __INTMAX_FMTd__ "lld"
 // X32:#define __INTMAX_FMTi__ "lli"
@@ -952,18 +997,23 @@
 // X32:#define __SSE_MATH__ 1
 // X32:#define __SSE__ 1
 // X32-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 16U
+// X32:#define __UINT16_C(c) c
 // X32:#define __UINT16_C_SUFFIX__
 // X32:#define __UINT16_MAX__ 65535
 // X32:#define __UINT16_TYPE__ unsigned short
+// X32:#define __UINT32_C(c) c##U
 // X32:#define __UINT32_C_SUFFIX__ U
 // X32:#define __UINT32_MAX__ 4294967295U
 // X32:#define __UINT32_TYPE__ unsigned int
+// X32:#define __UINT64_C(c) c##ULL
 // X32:#define __UINT64_C_SUFFIX__ ULL
 // X32:#define __UINT64_MAX__ 18446744073709551615ULL
 // X32:#define __UINT64_TYPE__ long long unsigned int
+// X32:#define __UINT8_C(c) c
 // X32:#define __UINT8_C_SUFFIX__
 // X32:#define __UINT8_MAX__ 255
 // X32:#define __UINT8_TYPE__ unsigned char
+// X32:#define __UINTMAX_C(c) c##ULL
 // X32:#define __UINTMAX_C_SUFFIX__ ULL
 // X32:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // X32:#define __UINTMAX_TYPE__ long long unsigned int
@@ -1046,26 +1096,31 @@
 // X86_64-LINUX:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // X86_64-LINUX:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // X86_64-LINUX:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// X86_64-LINUX:#define __INT16_C(c) c
 // X86_64-LINUX:#define __INT16_C_SUFFIX__
 // X86_64-LINUX:#define __INT16_FMTd__ "hd"
 // X86_64-LINUX:#define __INT16_FMTi__ "hi"
 // X86_64-LINUX:#define __INT16_MAX__ 32767
 // X86_64-LINUX:#define __INT16_TYPE__ short
+// X86_64-LINUX:#define __INT32_C(c) c
 // X86_64-LINUX:#define __INT32_C_SUFFIX__
 // X86_64-LINUX:#define __INT32_FMTd__ "d"
 // X86_64-LINUX:#define __INT32_FMTi__ "i"
 // X86_64-LINUX:#define __INT32_MAX__ 2147483647
 // X86_64-LINUX:#define __INT32_TYPE__ int
+// X86_64-LINUX:#define __INT64_C(c) c##L
 // X86_64-LINUX:#define __INT64_C_SUFFIX__ L
 // X86_64-LINUX:#define __INT64_FMTd__ "ld"
 // X86_64-LINUX:#define __INT64_FMTi__ "li"
 // X86_64-LINUX:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-LINUX:#define __INT64_TYPE__ long int
+// X86_64-LINUX:#define __INT8_C(c) c
 // X86_64-LINUX:#define __INT8_C_SUFFIX__
 // X86_64-LINUX:#define __INT8_FMTd__ "hhd"
 // X86_64-LINUX:#define __INT8_FMTi__ "hhi"
 // X86_64-LINUX:#define __INT8_MAX__ 127
 // X86_64-LINUX:#define __INT8_TYPE__ signed char
+// X86_64-LINUX:#define __INTMAX_C(c) c##L
 // X86_64-LINUX:#define __INTMAX_C_SUFFIX__ L
 // X86_64-LINUX:#define __INTMAX_FMTd__ "ld"
 // X86_64-LINUX:#define __INTMAX_FMTi__ "li"
@@ -1156,18 +1211,23 @@
 // X86_64-LINUX:#define __SSE2__ 1
 // X86_64-LINUX:#define __SSE_MATH__ 1
 // X86_64-LINUX:#define __SSE__ 1
+// X86_64-LINUX:#define __UINT16_C(c) c
 // X86_64-LINUX:#define __UINT16_C_SUFFIX__
 // X86_64-LINUX:#define __UINT16_MAX__ 65535
 // X86_64-LINUX:#define __UINT16_TYPE__ unsigned short
+// X86_64-LINUX:#define __UINT32_C(c) c##U
 // X86_64-LINUX:#define __UINT32_C_SUFFIX__ U
 // X86_64-LINUX:#define __UINT32_MAX__ 4294967295U
 // X86_64-LINUX:#define __UINT32_TYPE__ unsigned int
+// X86_64-LINUX:#define __UINT64_C(c) c##UL
 // X86_64-LINUX:#define __UINT64_C_SUFFIX__ UL
 // X86_64-LINUX:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-LINUX:#define __UINT64_TYPE__ long unsigned int
+// X86_64-LINUX:#define __UINT8_C(c) c
 // X86_64-LINUX:#define __UINT8_C_SUFFIX__
 // X86_64-LINUX:#define __UINT8_MAX__ 255
 // X86_64-LINUX:#define __UINT8_TYPE__ unsigned char
+// X86_64-LINUX:#define __UINTMAX_C(c) c##UL
 // X86_64-LINUX:#define __UINTMAX_C_SUFFIX__ UL
 // X86_64-LINUX:#define __UINTMAX_MAX__ 18446744073709551615UL
 // X86_64-LINUX:#define __UINTMAX_TYPE__ long unsigned int
@@ -1258,26 +1318,31 @@
 // X86_64-NETBSD:#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
 // X86_64-NETBSD:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
 // X86_64-NETBSD:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
+// X86_64-NETBSD:#define __INT16_C(c) c
 // X86_64-NETBSD:#define __INT16_C_SUFFIX__
 // X86_64-NETBSD:#define __INT16_FMTd__ "hd"
 // X86_64-NETBSD:#define __INT16_FMTi__ "hi"
 // X86_64-NETBSD:#define __INT16_MAX__ 32767
 // X86_64-NETBSD:#define __INT16_TYPE__ short
+// X86_64-NETBSD:#define __INT32_C(c) c
 // X86_64-NETBSD:#define __INT32_C_SUFFIX__
 // X86_64-NETBSD:#define __INT32_FMTd__ "d"
 // X86_64-NETBSD:#define __INT32_FMTi__ "i"
 // X86_64-NETBSD:#define __INT32_MAX__ 2147483647
 // X86_64-NETBSD:#define __INT32_TYPE__ int
+// X86_64-NETBSD:#define __INT64_C(c) c##L
 // X86_64-NETBSD:#define __INT64_C_SUFFIX__ L
 // X86_64-NETBSD:#define __INT64_FMTd__ "ld"
 // X86_64-NETBSD:#define __INT64_FMTi__ "li"
 // X86_64-NETBSD:#define __INT64_MAX__ 9223372036854775807L
 // X86_64-NETBSD:#define __INT64_TYPE__ long int
+// X86_64-NETBSD:#define __INT8_C(c) c
 // X86_64-NETBSD:#define __INT8_C_SUFFIX__
 // X86_64-NETBSD:#define __INT8_FMTd__ "hhd"
 // X86_64-NETBSD:#define __INT8_FMTi__ "hhi"
 // X86_64-NETBSD:#define __INT8_MAX__ 127
 // X86_64-NETBSD:#define __INT8_TYPE__ signed char
+// X86_64-NETBSD:#define __INTMAX_C(c) c##L
 // X86_64-NETBSD:#define __INTMAX_C_SUFFIX__ L
 // X86_64-NETBSD:#define __INTMAX_FMTd__ "ld"
 // X86_64-NETBSD:#define __INTMAX_FMTi__ "li"
@@ -1368,18 +1433,23 @@
 // X86_64-NETBSD:#define __SSE2__ 1
 // X86_64-NETBSD:#define __SSE_MATH__ 1
 // X86_64-NETBSD:#define __SSE__ 1
+// X86_64-NETBSD:#define __UINT16_C(c) c
 // X86_64-NETBSD:#define __UINT16_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT16_MAX__ 65535
 // X86_64-NETBSD:#define __UINT16_TYPE__ unsigned short
+// X86_64-NETBSD:#define __UINT32_C(c) c##U
 // X86_64-NETBSD:#define __UINT32_C_SUFFIX__ U
 // X86_64-NETBSD:#define __UINT32_MAX__ 4294967295U
 // X86_64-NETBSD:#define __UINT32_TYPE__ unsigned int
+// X86_64-NETBSD:#define __UINT64_C(c) c##UL
 // X86_64-NETBSD:#define __UINT64_C_SUFFIX__ UL
 // X86_64-NETBSD:#define __UINT64_MAX__ 18446744073709551615UL
 // X86_64-NETBSD:#define __UINT64_TYPE__ long unsigned int
+// X86_64-NETBSD:#define __UINT8_C(c) c
 // X86_64-NETBSD:#define __UINT8_C_SUFFIX__
 // X86_64-NETBSD:#define __UINT8_MAX__ 255
 // X86_64-NETBSD:#define __UINT8_TYPE__ unsigned char
+// X86_64-NETBSD:#define __UINTMAX_C(c) c##UL
 // X86_64-NETBSD:#define __UINTMAX_C_SUFFIX__ UL
 // X86_64-NETBSD:#define __UINTMAX_MAX__ 18446744073709551615UL
 // X86_64-NETBSD:#define __UINTMAX_TYPE__ long unsigned int
diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c
index 5999b9c1d1bc3..1ac325d444662 100644
--- a/clang/test/Preprocessor/init.c
+++ b/clang/test/Preprocessor/init.c
@@ -426,26 +426,31 @@
 // MSP430:#define __FLT_MIN_EXP__ (-125)
 // MSP430:#define __FLT_MIN__ 1.17549435e-38F
 // MSP430:#define __FLT_RADIX__ 2
+// MSP430:#define __INT16_C(c) c
 // MSP430:#define __INT16_C_SUFFIX__
 // MSP430:#define __INT16_FMTd__ "hd"
 // MSP430:#define __INT16_FMTi__ "hi"
 // MSP430:#define __INT16_MAX__ 32767
 // MSP430:#define __INT16_TYPE__ short
+// MSP430:#define __INT32_C(c) c##L
 // MSP430:#define __INT32_C_SUFFIX__ L
 // MSP430:#define __INT32_FMTd__ "ld"
 // MSP430:#define __INT32_FMTi__ "li"
 // MSP430:#define __INT32_MAX__ 2147483647L
 // MSP430:#define __INT32_TYPE__ long int
+// MSP430:#define __INT64_C(c) c##LL
 // MSP430:#define __INT64_C_SUFFIX__ LL
 // MSP430:#define __INT64_FMTd__ "lld"
 // MSP430:#define __INT64_FMTi__ "lli"
 // MSP430:#define __INT64_MAX__ 9223372036854775807LL
 // MSP430:#define __INT64_TYPE__ long long int
+// MSP430:#define __INT8_C(c) c
 // MSP430:#define __INT8_C_SUFFIX__
 // MSP430:#define __INT8_FMTd__ "hhd"
 // MSP430:#define __INT8_FMTi__ "hhi"
 // MSP430:#define __INT8_MAX__ 127
 // MSP430:#define __INT8_TYPE__ signed char
+// MSP430:#define __INTMAX_C(c) c##LL
 // MSP430:#define __INTMAX_C_SUFFIX__ LL
 // MSP430:#define __INTMAX_FMTd__ "lld"
 // MSP430:#define __INTMAX_FMTi__ "lli"
@@ -531,18 +536,23 @@
 // MSP430:#define __SIZE_TYPE__ unsigned int
 // MSP430:#define __SIZE_WIDTH__ 16
 // MSP430-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 2U
+// MSP430:#define __UINT16_C(c) c##U
 // MSP430:#define __UINT16_C_SUFFIX__ U
 // MSP430:#define __UINT16_MAX__ 65535U
 // MSP430:#define __UINT16_TYPE__ unsigned short
+// MSP430:#define __UINT32_C(c) c##UL
 // MSP430:#define __UINT32_C_SUFFIX__ UL
 // MSP430:#define __UINT32_MAX__ 4294967295UL
 // MSP430:#define __UINT32_TYPE__ long unsigned int
+// MSP430:#define __UINT64_C(c) c##ULL
 // MSP430:#define __UINT64_C_SUFFIX__ ULL
 // MSP430:#define __UINT64_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINT64_TYPE__ long long unsigned int
+// MSP430:#define __UINT8_C(c) c
 // MSP430:#define __UINT8_C_SUFFIX__
 // MSP430:#define __UINT8_MAX__ 255
 // MSP430:#define __UINT8_TYPE__ unsigned char
+// MSP430:#define __UINTMAX_C(c) c##ULL
 // MSP430:#define __UINTMAX_C_SUFFIX__ ULL
 // MSP430:#define __UINTMAX_MAX__ 18446744073709551615ULL
 // MSP430:#define __UINTMAX_TYPE__ long long unsigned int
@@ -613,26 +623,31 @@
 // NVPTX32:#define __FLT_MIN_EXP__ (-125)
 // NVPTX32:#define __FLT_MIN__ 1.17549435e-38F
 // NVPTX32:#define __FLT_RADIX__ 2
+// NVPTX32:#define __INT16_C(c) c
 // NVPTX32:#define __INT16_C_SUFFIX__
 // NVPTX32:#define __INT16_FMTd__ "hd"
 // NVPTX32:#define __INT16_FMTi__ "hi"
 // NVPTX32:#define __INT16_MAX__ 32767
 // NVPTX32:#define __INT16_TYPE__ short
+// NVPTX32:#define __INT32_C(c) c
 // NVPTX32:#define __INT32_C_SUFFIX__
 // NVPTX32:#define __INT32_FMTd__ "d"
 // NVPTX32:#define __INT32_FMTi__ "i"
 // NVPTX32:#define __INT32_MAX__ 2147483647
 // NVPTX32:#define __INT32_TYPE__ int
+// NVPTX32:#define __INT64_C(c) c##LL
 // NVPTX32:#define __INT64_C_SUFFIX__ LL
 // NVPTX32:#define __INT64_FMTd__ "lld"
 // NVPTX32:#define __INT64_FMTi__ "lli"
// NVPTX32:#define __INT64_MAX__ 9223372036854775807LL // NVPTX32:#define __INT64_TYPE__ long long int +// NVPTX32:#define __INT8_C(c) c // NVPTX32:#define __INT8_C_SUFFIX__ // NVPTX32:#define __INT8_FMTd__ "hhd" // NVPTX32:#define __INT8_FMTi__ "hhi" // NVPTX32:#define __INT8_MAX__ 127 // NVPTX32:#define __INT8_TYPE__ signed char +// NVPTX32:#define __INTMAX_C(c) c##LL // NVPTX32:#define __INTMAX_C_SUFFIX__ LL // NVPTX32:#define __INTMAX_FMTd__ "lld" // NVPTX32:#define __INTMAX_FMTi__ "lli" @@ -720,18 +735,23 @@ // NVPTX32:#define __SIZE_TYPE__ unsigned int // NVPTX32:#define __SIZE_WIDTH__ 32 // NVPTX32-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U +// NVPTX32:#define __UINT16_C(c) c // NVPTX32:#define __UINT16_C_SUFFIX__ // NVPTX32:#define __UINT16_MAX__ 65535 // NVPTX32:#define __UINT16_TYPE__ unsigned short +// NVPTX32:#define __UINT32_C(c) c##U // NVPTX32:#define __UINT32_C_SUFFIX__ U // NVPTX32:#define __UINT32_MAX__ 4294967295U // NVPTX32:#define __UINT32_TYPE__ unsigned int +// NVPTX32:#define __UINT64_C(c) c##ULL // NVPTX32:#define __UINT64_C_SUFFIX__ ULL // NVPTX32:#define __UINT64_MAX__ 18446744073709551615ULL // NVPTX32:#define __UINT64_TYPE__ long long unsigned int +// NVPTX32:#define __UINT8_C(c) c // NVPTX32:#define __UINT8_C_SUFFIX__ // NVPTX32:#define __UINT8_MAX__ 255 // NVPTX32:#define __UINT8_TYPE__ unsigned char +// NVPTX32:#define __UINTMAX_C(c) c##ULL // NVPTX32:#define __UINTMAX_C_SUFFIX__ ULL // NVPTX32:#define __UINTMAX_MAX__ 18446744073709551615ULL // NVPTX32:#define __UINTMAX_TYPE__ long long unsigned int @@ -801,26 +821,31 @@ // NVPTX64:#define __FLT_MIN_EXP__ (-125) // NVPTX64:#define __FLT_MIN__ 1.17549435e-38F // NVPTX64:#define __FLT_RADIX__ 2 +// NVPTX64:#define __INT16_C(c) c // NVPTX64:#define __INT16_C_SUFFIX__ // NVPTX64:#define __INT16_FMTd__ "hd" // NVPTX64:#define __INT16_FMTi__ "hi" // NVPTX64:#define __INT16_MAX__ 32767 // NVPTX64:#define __INT16_TYPE__ short +// NVPTX64:#define __INT32_C(c) c // NVPTX64:#define __INT32_C_SUFFIX__ // NVPTX64:#define __INT32_FMTd__ "d" // NVPTX64:#define __INT32_FMTi__ "i" // NVPTX64:#define __INT32_MAX__ 2147483647 // NVPTX64:#define __INT32_TYPE__ int +// NVPTX64:#define __INT64_C(c) c##LL // NVPTX64:#define __INT64_C_SUFFIX__ LL // NVPTX64:#define __INT64_FMTd__ "lld" // NVPTX64:#define __INT64_FMTi__ "lli" // NVPTX64:#define __INT64_MAX__ 9223372036854775807LL // NVPTX64:#define __INT64_TYPE__ long long int +// NVPTX64:#define __INT8_C(c) c // NVPTX64:#define __INT8_C_SUFFIX__ // NVPTX64:#define __INT8_FMTd__ "hhd" // NVPTX64:#define __INT8_FMTi__ "hhi" // NVPTX64:#define __INT8_MAX__ 127 // NVPTX64:#define __INT8_TYPE__ signed char +// NVPTX64:#define __INTMAX_C(c) c##LL // NVPTX64:#define __INTMAX_C_SUFFIX__ LL // NVPTX64:#define __INTMAX_FMTd__ "lld" // NVPTX64:#define __INTMAX_FMTi__ "lli" @@ -908,18 +933,23 @@ // NVPTX64:#define __SIZE_TYPE__ long unsigned int // NVPTX64:#define __SIZE_WIDTH__ 64 // NVPTX64-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8UL +// NVPTX64:#define __UINT16_C(c) c // NVPTX64:#define __UINT16_C_SUFFIX__ // NVPTX64:#define __UINT16_MAX__ 65535 // NVPTX64:#define __UINT16_TYPE__ unsigned short +// NVPTX64:#define __UINT32_C(c) c##U // NVPTX64:#define __UINT32_C_SUFFIX__ U // NVPTX64:#define __UINT32_MAX__ 4294967295U // NVPTX64:#define __UINT32_TYPE__ unsigned int +// NVPTX64:#define __UINT64_C(c) c##ULL // NVPTX64:#define __UINT64_C_SUFFIX__ ULL // NVPTX64:#define __UINT64_MAX__ 18446744073709551615ULL // NVPTX64:#define __UINT64_TYPE__ long long unsigned int +// NVPTX64:#define 
__UINT8_C(c) c // NVPTX64:#define __UINT8_C_SUFFIX__ // NVPTX64:#define __UINT8_MAX__ 255 // NVPTX64:#define __UINT8_TYPE__ unsigned char +// NVPTX64:#define __UINTMAX_C(c) c##ULL // NVPTX64:#define __UINTMAX_C_SUFFIX__ ULL // NVPTX64:#define __UINTMAX_MAX__ 18446744073709551615ULL // NVPTX64:#define __UINTMAX_TYPE__ long long unsigned int @@ -1003,26 +1033,31 @@ // SPARC:#define __FLT_MIN__ 1.17549435e-38F // SPARC:#define __FLT_RADIX__ 2 // SPARC:#define __GCC_ATOMIC_LLONG_LOCK_FREE 1 +// SPARC:#define __INT16_C(c) c // SPARC:#define __INT16_C_SUFFIX__ // SPARC:#define __INT16_FMTd__ "hd" // SPARC:#define __INT16_FMTi__ "hi" // SPARC:#define __INT16_MAX__ 32767 // SPARC:#define __INT16_TYPE__ short +// SPARC:#define __INT32_C(c) c // SPARC:#define __INT32_C_SUFFIX__ // SPARC:#define __INT32_FMTd__ "d" // SPARC:#define __INT32_FMTi__ "i" // SPARC:#define __INT32_MAX__ 2147483647 // SPARC:#define __INT32_TYPE__ int +// SPARC:#define __INT64_C(c) c##LL // SPARC:#define __INT64_C_SUFFIX__ LL // SPARC:#define __INT64_FMTd__ "lld" // SPARC:#define __INT64_FMTi__ "lli" // SPARC:#define __INT64_MAX__ 9223372036854775807LL // SPARC:#define __INT64_TYPE__ long long int +// SPARC:#define __INT8_C(c) c // SPARC:#define __INT8_C_SUFFIX__ // SPARC:#define __INT8_FMTd__ "hhd" // SPARC:#define __INT8_FMTi__ "hhi" // SPARC:#define __INT8_MAX__ 127 // SPARC:#define __INT8_TYPE__ signed char +// SPARC:#define __INTMAX_C(c) c##LL // SPARC:#define __INTMAX_C_SUFFIX__ LL // SPARC:#define __INTMAX_FMTd__ "lld" // SPARC:#define __INTMAX_FMTi__ "lli" @@ -1114,18 +1149,23 @@ // SPARC-NETOPENBSD:#define __SIZE_TYPE__ long unsigned int // SPARC:#define __SIZE_WIDTH__ 32 // SPARC-DEFAULT-CXX:#define __STDCPP_DEFAULT_NEW_ALIGNMENT__ 8U +// SPARC:#define __UINT16_C(c) c // SPARC:#define __UINT16_C_SUFFIX__ // SPARC:#define __UINT16_MAX__ 65535 // SPARC:#define __UINT16_TYPE__ unsigned short +// SPARC:#define __UINT32_C(c) c##U // SPARC:#define __UINT32_C_SUFFIX__ U // SPARC:#define __UINT32_MAX__ 4294967295U // SPARC:#define __UINT32_TYPE__ unsigned int +// SPARC:#define __UINT64_C(c) c##ULL // SPARC:#define __UINT64_C_SUFFIX__ ULL // SPARC:#define __UINT64_MAX__ 18446744073709551615ULL // SPARC:#define __UINT64_TYPE__ long long unsigned int +// SPARC:#define __UINT8_C(c) c // SPARC:#define __UINT8_C_SUFFIX__ // SPARC:#define __UINT8_MAX__ 255 // SPARC:#define __UINT8_TYPE__ unsigned char +// SPARC:#define __UINTMAX_C(c) c##ULL // SPARC:#define __UINTMAX_C_SUFFIX__ ULL // SPARC:#define __UINTMAX_MAX__ 18446744073709551615ULL // SPARC:#define __UINTMAX_TYPE__ long long unsigned int @@ -1201,21 +1241,25 @@ // TCE:#define __FLT_MIN_EXP__ (-125) // TCE:#define __FLT_MIN__ 1.17549435e-38F // TCE:#define __FLT_RADIX__ 2 +// TCE:#define __INT16_C(c) c // TCE:#define __INT16_C_SUFFIX__ // TCE:#define __INT16_FMTd__ "hd" // TCE:#define __INT16_FMTi__ "hi" // TCE:#define __INT16_MAX__ 32767 // TCE:#define __INT16_TYPE__ short +// TCE:#define __INT32_C(c) c // TCE:#define __INT32_C_SUFFIX__ // TCE:#define __INT32_FMTd__ "d" // TCE:#define __INT32_FMTi__ "i" // TCE:#define __INT32_MAX__ 2147483647 // TCE:#define __INT32_TYPE__ int +// TCE:#define __INT8_C(c) c // TCE:#define __INT8_C_SUFFIX__ // TCE:#define __INT8_FMTd__ "hhd" // TCE:#define __INT8_FMTi__ "hhi" // TCE:#define __INT8_MAX__ 127 // TCE:#define __INT8_TYPE__ signed char +// TCE:#define __INTMAX_C(c) c##L // TCE:#define __INTMAX_C_SUFFIX__ L // TCE:#define __INTMAX_FMTd__ "ld" // TCE:#define __INTMAX_FMTi__ "li" @@ -1293,15 +1337,19 @@ // TCE-CXX:#define 
__STDCPP_DEFAULT_NEW_ALIGNMENT__ 4U // TCE:#define __TCE_V1__ 1 // TCE:#define __TCE__ 1 +// TCE:#define __UINT16_C(c) c // TCE:#define __UINT16_C_SUFFIX__ // TCE:#define __UINT16_MAX__ 65535 // TCE:#define __UINT16_TYPE__ unsigned short +// TCE:#define __UINT32_C(c) c##U // TCE:#define __UINT32_C_SUFFIX__ U // TCE:#define __UINT32_MAX__ 4294967295U // TCE:#define __UINT32_TYPE__ unsigned int +// TCE:#define __UINT8_C(c) c // TCE:#define __UINT8_C_SUFFIX__ // TCE:#define __UINT8_MAX__ 255 // TCE:#define __UINT8_TYPE__ unsigned char +// TCE:#define __UINTMAX_C(c) c##UL // TCE:#define __UINTMAX_C_SUFFIX__ UL // TCE:#define __UINTMAX_MAX__ 4294967295UL // TCE:#define __UINTMAX_TYPE__ long unsigned int @@ -1373,6 +1421,7 @@ // PS4:#define __FreeBSD_cc_version 900001 // PS4:#define __INT16_TYPE__ short // PS4:#define __INT32_TYPE__ int +// PS4:#define __INT64_C(c) c##L // PS4:#define __INT64_C_SUFFIX__ L // PS4:#define __INT64_TYPE__ long int // PS4:#define __INT8_TYPE__ signed char @@ -1464,6 +1513,7 @@ // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-none < /dev/null | FileCheck -match-full-lines -check-prefix SPARCV9 %s // SPARCV9:#define __BIGGEST_ALIGNMENT__ 16 // SPARCV9:#define __INT64_TYPE__ long int +// SPARCV9:#define __INTMAX_C(c) c##L // SPARCV9:#define __INTMAX_C_SUFFIX__ L // SPARCV9:#define __INTMAX_TYPE__ long int // SPARCV9:#define __INTPTR_TYPE__ long int @@ -1475,8 +1525,10 @@ // // RUN: %clang_cc1 -E -dM -ffreestanding -triple=sparc64-none-openbsd < /dev/null | FileCheck -match-full-lines -check-prefix SPARC64-OBSD %s // SPARC64-OBSD:#define __INT64_TYPE__ long long int +// SPARC64-OBSD:#define __INTMAX_C(c) c##LL // SPARC64-OBSD:#define __INTMAX_C_SUFFIX__ LL // SPARC64-OBSD:#define __INTMAX_TYPE__ long long int +// SPARC64-OBSD:#define __UINTMAX_C(c) c##ULL // SPARC64-OBSD:#define __UINTMAX_C_SUFFIX__ ULL // SPARC64-OBSD:#define __UINTMAX_TYPE__ long long unsigned int // @@ -1720,26 +1772,31 @@ // WEBASSEMBLY-NEXT:#define __GXX_ABI_VERSION 1002 // WEBASSEMBLY32-NEXT:#define __ILP32__ 1 // WEBASSEMBLY64-NOT:#define __ILP32__ +// WEBASSEMBLY-NEXT:#define __INT16_C(c) c // WEBASSEMBLY-NEXT:#define __INT16_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __INT16_FMTd__ "hd" // WEBASSEMBLY-NEXT:#define __INT16_FMTi__ "hi" // WEBASSEMBLY-NEXT:#define __INT16_MAX__ 32767 // WEBASSEMBLY-NEXT:#define __INT16_TYPE__ short +// WEBASSEMBLY-NEXT:#define __INT32_C(c) c // WEBASSEMBLY-NEXT:#define __INT32_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __INT32_FMTd__ "d" // WEBASSEMBLY-NEXT:#define __INT32_FMTi__ "i" // WEBASSEMBLY-NEXT:#define __INT32_MAX__ 2147483647 // WEBASSEMBLY-NEXT:#define __INT32_TYPE__ int +// WEBASSEMBLY-NEXT:#define __INT64_C(c) c##LL // WEBASSEMBLY-NEXT:#define __INT64_C_SUFFIX__ LL // WEBASSEMBLY-NEXT:#define __INT64_FMTd__ "lld" // WEBASSEMBLY-NEXT:#define __INT64_FMTi__ "lli" // WEBASSEMBLY-NEXT:#define __INT64_MAX__ 9223372036854775807LL // WEBASSEMBLY-NEXT:#define __INT64_TYPE__ long long int +// WEBASSEMBLY-NEXT:#define __INT8_C(c) c // WEBASSEMBLY-NEXT:#define __INT8_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __INT8_FMTd__ "hhd" // WEBASSEMBLY-NEXT:#define __INT8_FMTi__ "hhi" // WEBASSEMBLY-NEXT:#define __INT8_MAX__ 127 // WEBASSEMBLY-NEXT:#define __INT8_TYPE__ signed char +// WEBASSEMBLY-NEXT:#define __INTMAX_C(c) c##LL // WEBASSEMBLY-NEXT:#define __INTMAX_C_SUFFIX__ LL // WEBASSEMBLY-NEXT:#define __INTMAX_FMTd__ "lld" // WEBASSEMBLY-NEXT:#define __INTMAX_FMTi__ "lli" @@ -1892,6 +1949,7 @@ // WEBASSEMBLY-NEXT:#define __STDC_UTF_32__ 1 // 
WEBASSEMBLY-NEXT:#define __STDC_VERSION__ 201710L // WEBASSEMBLY-NEXT:#define __STDC__ 1 +// WEBASSEMBLY-NEXT:#define __UINT16_C(c) c // WEBASSEMBLY-NEXT:#define __UINT16_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __UINT16_FMTX__ "hX" // WEBASSEMBLY-NEXT:#define __UINT16_FMTo__ "ho" @@ -1899,6 +1957,7 @@ // WEBASSEMBLY-NEXT:#define __UINT16_FMTx__ "hx" // WEBASSEMBLY-NEXT:#define __UINT16_MAX__ 65535 // WEBASSEMBLY-NEXT:#define __UINT16_TYPE__ unsigned short +// WEBASSEMBLY-NEXT:#define __UINT32_C(c) c##U // WEBASSEMBLY-NEXT:#define __UINT32_C_SUFFIX__ U // WEBASSEMBLY-NEXT:#define __UINT32_FMTX__ "X" // WEBASSEMBLY-NEXT:#define __UINT32_FMTo__ "o" @@ -1906,6 +1965,7 @@ // WEBASSEMBLY-NEXT:#define __UINT32_FMTx__ "x" // WEBASSEMBLY-NEXT:#define __UINT32_MAX__ 4294967295U // WEBASSEMBLY-NEXT:#define __UINT32_TYPE__ unsigned int +// WEBASSEMBLY-NEXT:#define __UINT64_C(c) c##ULL // WEBASSEMBLY-NEXT:#define __UINT64_C_SUFFIX__ ULL // WEBASSEMBLY-NEXT:#define __UINT64_FMTX__ "llX" // WEBASSEMBLY-NEXT:#define __UINT64_FMTo__ "llo" @@ -1913,6 +1973,7 @@ // WEBASSEMBLY-NEXT:#define __UINT64_FMTx__ "llx" // WEBASSEMBLY-NEXT:#define __UINT64_MAX__ 18446744073709551615ULL // WEBASSEMBLY-NEXT:#define __UINT64_TYPE__ long long unsigned int +// WEBASSEMBLY-NEXT:#define __UINT8_C(c) c // WEBASSEMBLY-NEXT:#define __UINT8_C_SUFFIX__ // WEBASSEMBLY-NEXT:#define __UINT8_FMTX__ "hhX" // WEBASSEMBLY-NEXT:#define __UINT8_FMTo__ "hho" @@ -1920,6 +1981,7 @@ // WEBASSEMBLY-NEXT:#define __UINT8_FMTx__ "hhx" // WEBASSEMBLY-NEXT:#define __UINT8_MAX__ 255 // WEBASSEMBLY-NEXT:#define __UINT8_TYPE__ unsigned char +// WEBASSEMBLY-NEXT:#define __UINTMAX_C(c) c##ULL // WEBASSEMBLY-NEXT:#define __UINTMAX_C_SUFFIX__ ULL // WEBASSEMBLY-NEXT:#define __UINTMAX_FMTX__ "llX" // WEBASSEMBLY-NEXT:#define __UINTMAX_FMTo__ "llo" @@ -2092,18 +2154,23 @@ // AVR:#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1 // AVR:#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 1 // AVR:#define __GXX_ABI_VERSION 1002 +// AVR:#define __INT16_C(c) c // AVR:#define __INT16_C_SUFFIX__ // AVR:#define __INT16_MAX__ 32767 // AVR:#define __INT16_TYPE__ int +// AVR:#define __INT32_C(c) c##L // AVR:#define __INT32_C_SUFFIX__ L // AVR:#define __INT32_MAX__ 2147483647L // AVR:#define __INT32_TYPE__ long int +// AVR:#define __INT64_C(c) c##LL // AVR:#define __INT64_C_SUFFIX__ LL // AVR:#define __INT64_MAX__ 9223372036854775807LL // AVR:#define __INT64_TYPE__ long long int +// AVR:#define __INT8_C(c) c // AVR:#define __INT8_C_SUFFIX__ // AVR:#define __INT8_MAX__ 127 // AVR:#define __INT8_TYPE__ signed char +// AVR:#define __INTMAX_C(c) c##LL // AVR:#define __INTMAX_C_SUFFIX__ LL // AVR:#define __INTMAX_MAX__ 9223372036854775807LL // AVR:#define __INTMAX_TYPE__ long long int @@ -2175,15 +2242,19 @@ // AVR:#define __STDC__ 1 // AVR:#define __UINT16_MAX__ 65535U // AVR:#define __UINT16_TYPE__ unsigned int +// AVR:#define __UINT32_C(c) c##UL // AVR:#define __UINT32_C_SUFFIX__ UL // AVR:#define __UINT32_MAX__ 4294967295UL // AVR:#define __UINT32_TYPE__ long unsigned int +// AVR:#define __UINT64_C(c) c##ULL // AVR:#define __UINT64_C_SUFFIX__ ULL // AVR:#define __UINT64_MAX__ 18446744073709551615ULL // AVR:#define __UINT64_TYPE__ long long unsigned int +// AVR:#define __UINT8_C(c) c // AVR:#define __UINT8_C_SUFFIX__ // AVR:#define __UINT8_MAX__ 255 // AVR:#define __UINT8_TYPE__ unsigned char +// AVR:#define __UINTMAX_C(c) c##ULL // AVR:#define __UINTMAX_C_SUFFIX__ ULL // AVR:#define __UINTMAX_MAX__ 18446744073709551615ULL // AVR:#define __UINTMAX_TYPE__ long long unsigned int @@ -2383,18 
+2454,23 @@ // RISCV32: #define __GNUC__ {{.*}} // RISCV32: #define __GXX_ABI_VERSION {{.*}} // RISCV32: #define __ILP32__ 1 +// RISCV32: #define __INT16_C(c) c // RISCV32: #define __INT16_C_SUFFIX__ // RISCV32: #define __INT16_MAX__ 32767 // RISCV32: #define __INT16_TYPE__ short +// RISCV32: #define __INT32_C(c) c // RISCV32: #define __INT32_C_SUFFIX__ // RISCV32: #define __INT32_MAX__ 2147483647 // RISCV32: #define __INT32_TYPE__ int +// RISCV32: #define __INT64_C(c) c##LL // RISCV32: #define __INT64_C_SUFFIX__ LL // RISCV32: #define __INT64_MAX__ 9223372036854775807LL // RISCV32: #define __INT64_TYPE__ long long int +// RISCV32: #define __INT8_C(c) c // RISCV32: #define __INT8_C_SUFFIX__ // RISCV32: #define __INT8_MAX__ 127 // RISCV32: #define __INT8_TYPE__ signed char +// RISCV32: #define __INTMAX_C(c) c##LL // RISCV32: #define __INTMAX_C_SUFFIX__ LL // RISCV32: #define __INTMAX_MAX__ 9223372036854775807LL // RISCV32: #define __INTMAX_TYPE__ long long int @@ -2474,18 +2550,23 @@ // RISCV32: #define __STDC_UTF_32__ 1 // RISCV32: #define __STDC_VERSION__ 201710L // RISCV32: #define __STDC__ 1 +// RISCV32: #define __UINT16_C(c) c // RISCV32: #define __UINT16_C_SUFFIX__ // RISCV32: #define __UINT16_MAX__ 65535 // RISCV32: #define __UINT16_TYPE__ unsigned short +// RISCV32: #define __UINT32_C(c) c##U // RISCV32: #define __UINT32_C_SUFFIX__ U // RISCV32: #define __UINT32_MAX__ 4294967295U // RISCV32: #define __UINT32_TYPE__ unsigned int +// RISCV32: #define __UINT64_C(c) c##ULL // RISCV32: #define __UINT64_C_SUFFIX__ ULL // RISCV32: #define __UINT64_MAX__ 18446744073709551615ULL // RISCV32: #define __UINT64_TYPE__ long long unsigned int +// RISCV32: #define __UINT8_C(c) c // RISCV32: #define __UINT8_C_SUFFIX__ // RISCV32: #define __UINT8_MAX__ 255 // RISCV32: #define __UINT8_TYPE__ unsigned char +// RISCV32: #define __UINTMAX_C(c) c##ULL // RISCV32: #define __UINTMAX_C_SUFFIX__ ULL // RISCV32: #define __UINTMAX_MAX__ 18446744073709551615ULL // RISCV32: #define __UINTMAX_TYPE__ long long unsigned int @@ -2596,18 +2677,23 @@ // RISCV64: #define __GNUC_STDC_INLINE__ 1 // RISCV64: #define __GNUC__ {{.*}} // RISCV64: #define __GXX_ABI_VERSION {{.*}} +// RISCV64: #define __INT16_C(c) c // RISCV64: #define __INT16_C_SUFFIX__ // RISCV64: #define __INT16_MAX__ 32767 // RISCV64: #define __INT16_TYPE__ short +// RISCV64: #define __INT32_C(c) c // RISCV64: #define __INT32_C_SUFFIX__ // RISCV64: #define __INT32_MAX__ 2147483647 // RISCV64: #define __INT32_TYPE__ int +// RISCV64: #define __INT64_C(c) c##L // RISCV64: #define __INT64_C_SUFFIX__ L // RISCV64: #define __INT64_MAX__ 9223372036854775807L // RISCV64: #define __INT64_TYPE__ long int +// RISCV64: #define __INT8_C(c) c // RISCV64: #define __INT8_C_SUFFIX__ // RISCV64: #define __INT8_MAX__ 127 // RISCV64: #define __INT8_TYPE__ signed char +// RISCV64: #define __INTMAX_C(c) c##L // RISCV64: #define __INTMAX_C_SUFFIX__ L // RISCV64: #define __INTMAX_MAX__ 9223372036854775807L // RISCV64: #define __INTMAX_TYPE__ long int @@ -2687,18 +2773,23 @@ // RISCV64: #define __STDC_UTF_32__ 1 // RISCV64: #define __STDC_VERSION__ 201710L // RISCV64: #define __STDC__ 1 +// RISCV64: #define __UINT16_C(c) c // RISCV64: #define __UINT16_C_SUFFIX__ // RISCV64: #define __UINT16_MAX__ 65535 // RISCV64: #define __UINT16_TYPE__ unsigned short +// RISCV64: #define __UINT32_C(c) c##U // RISCV64: #define __UINT32_C_SUFFIX__ U // RISCV64: #define __UINT32_MAX__ 4294967295U // RISCV64: #define __UINT32_TYPE__ unsigned int +// RISCV64: #define __UINT64_C(c) c##UL // RISCV64: 
#define __UINT64_C_SUFFIX__ UL // RISCV64: #define __UINT64_MAX__ 18446744073709551615UL // RISCV64: #define __UINT64_TYPE__ long unsigned int +// RISCV64: #define __UINT8_C(c) c // RISCV64: #define __UINT8_C_SUFFIX__ // RISCV64: #define __UINT8_MAX__ 255 // RISCV64: #define __UINT8_TYPE__ unsigned char +// RISCV64: #define __UINTMAX_C(c) c##UL // RISCV64: #define __UINTMAX_C_SUFFIX__ UL // RISCV64: #define __UINTMAX_MAX__ 18446744073709551615UL // RISCV64: #define __UINTMAX_TYPE__ long unsigned int @@ -2837,18 +2928,23 @@ // XTENSA: #define __GNUC__ {{.*}} // XTENSA: #define __GXX_ABI_VERSION {{.*}} // XTENSA: #define __ILP32__ 1 +// XTENSA: #define __INT16_C(c) c // XTENSA: #define __INT16_C_SUFFIX__ // XTENSA: #define __INT16_MAX__ 32767 // XTENSA: #define __INT16_TYPE__ short +// XTENSA: #define __INT32_C(c) c // XTENSA: #define __INT32_C_SUFFIX__ // XTENSA: #define __INT32_MAX__ 2147483647 // XTENSA: #define __INT32_TYPE__ int +// XTENSA: #define __INT64_C(c) c##LL // XTENSA: #define __INT64_C_SUFFIX__ LL // XTENSA: #define __INT64_MAX__ 9223372036854775807LL // XTENSA: #define __INT64_TYPE__ long long int +// XTENSA: #define __INT8_C(c) c // XTENSA: #define __INT8_C_SUFFIX__ // XTENSA: #define __INT8_MAX__ 127 // XTENSA: #define __INT8_TYPE__ signed char +// XTENSA: #define __INTMAX_C(c) c##LL // XTENSA: #define __INTMAX_C_SUFFIX__ LL // XTENSA: #define __INTMAX_MAX__ 9223372036854775807LL // XTENSA: #define __INTMAX_TYPE__ long long int @@ -2945,18 +3041,23 @@ // XTENSA: #define __STDC_UTF_32__ 1 // XTENSA: #define __STDC_VERSION__ 201710L // XTENSA: #define __STDC__ 1 +// XTENSA: #define __UINT16_C(c) c // XTENSA: #define __UINT16_C_SUFFIX__ // XTENSA: #define __UINT16_MAX__ 65535 // XTENSA: #define __UINT16_TYPE__ unsigned short +// XTENSA: #define __UINT32_C(c) c##U // XTENSA: #define __UINT32_C_SUFFIX__ U // XTENSA: #define __UINT32_MAX__ 4294967295U // XTENSA: #define __UINT32_TYPE__ unsigned int +// XTENSA: #define __UINT64_C(c) c##ULL // XTENSA: #define __UINT64_C_SUFFIX__ ULL // XTENSA: #define __UINT64_MAX__ 18446744073709551615ULL // XTENSA: #define __UINT64_TYPE__ long long unsigned int +// XTENSA: #define __UINT8_C(c) c // XTENSA: #define __UINT8_C_SUFFIX__ // XTENSA: #define __UINT8_MAX__ 255 // XTENSA: #define __UINT8_TYPE__ unsigned char +// XTENSA: #define __UINTMAX_C(c) c##ULL // XTENSA: #define __UINTMAX_C_SUFFIX__ ULL // XTENSA: #define __UINTMAX_MAX__ 18446744073709551615ULL // XTENSA: #define __UINTMAX_TYPE__ long long unsigned int From 2af819fa3d802e55027dcc1408186cb8738f08e6 Mon Sep 17 00:00:00 2001 From: Teresa Johnson Date: Sun, 26 Jan 2025 07:53:53 -0800 Subject: [PATCH 141/432] [MemProf] Add test for hot hints (#124394) The change in PR124219 required removing one of the tests added for -memprof-use-hot-hints, since we no longer label any contexts as hot in metadata, so add a new test that checks the hot attribute instead. 
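As a side note (not part of this patch): the hint lands as an ordinary string function attribute on the allocation call site, so downstream C++ code could test for it roughly as sketched below. This is an illustrative sketch only; isMemProfHot is a hypothetical helper, not an existing LLVM API.

#include "llvm/IR/InstrTypes.h"

// Returns true if memprof-use marked this allocation call site as hot,
// i.e. it carries the string attribute "memprof"="hot" checked by the
// MEMPROFHOT lines below.
static bool isMemProfHot(const llvm::CallBase &CB) {
  llvm::Attribute A = CB.getFnAttr("memprof");
  return A.isValid() && A.getValueAsString() == "hot";
}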
--- llvm/test/Transforms/PGOProfile/memprof.ll | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/test/Transforms/PGOProfile/memprof.ll b/llvm/test/Transforms/PGOProfile/memprof.ll index 6aa2d307a1dc8..acf70880becd1 100644 --- a/llvm/test/Transforms/PGOProfile/memprof.ll +++ b/llvm/test/Transforms/PGOProfile/memprof.ll @@ -85,6 +85,14 @@ ; RAND2: random hotness seed = 1730170724 ; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -stats 2>&1 | FileCheck %s --check-prefixes=MEMPROFRAND2,ALL,MEMPROFONLY,MEMPROFSTATS +;; With the hot access density threshold set to 0, and hot hints enabled, +;; the unconditionally notcold call to new should instead get a hot attribute. +; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -memprof-print-match-info -stats -memprof-min-ave-lifetime-access-density-hot-threshold=0 -memprof-use-hot-hints 2>&1 | FileCheck %s --check-prefixes=MEMPROFHOT,ALL + +;; However, with the same threshold, but hot hints not enabled, it should be +;; notcold again. +; RUN: opt < %s -passes='memprof-use' -pgo-warn-missing-function -S -memprof-min-ave-lifetime-access-density-hot-threshold=0 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL + ; MEMPROFMATCHINFO: MemProf notcold context with id 1093248920606587996 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 5725971306423925017 has total profiled size 10 is matched ; MEMPROFMATCHINFO: MemProf notcold context with id 6792096022461663180 has total profiled size 10 is matched @@ -192,6 +200,7 @@ entry: store ptr %argv, ptr %argv.addr, align 8 ; MEMPROF: call {{.*}} @_Znam{{.*}} #[[A1:[0-9]+]] ; MEMPROFNOCOLINFO: call {{.*}} @_Znam{{.*}} #[[A1:[0-9]+]] + ; MEMPROFHOT: call {{.*}} @_Znam{{.*}} #[[A1:[0-9]+]] %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !dbg !35 store ptr %call, ptr %a, align 8, !dbg !36 ; MEMPROF: call {{.*}} @_Znam{{.*}} #[[A2:[0-9]+]] @@ -404,6 +413,8 @@ for.end: ; preds = %for.cond ; MEMPROFNOCOLINFO: ![[C10]] = !{i64 -4535090212904553409} ; MEMPROFNOCOLINFO: ![[C11]] = !{i64 3577763375057267810} +; MEMPROFHOT: #[[A1]] = { builtin allocsize(0) "memprof"="hot" } + ;; For the specific random seed, this is the expected order of hotness ; MEMPROFRAND2: !"cold" ; MEMPROFRAND2: !"cold" From f8ab91f74f152c8a6d8aaedb8165109c497a618d Mon Sep 17 00:00:00 2001 From: Andreas Jonson Date: Sun, 26 Jan 2025 16:53:43 +0100 Subject: [PATCH 142/432] [LVI][CVP] Add test for trunc bittest. (NFC) --- .../CorrelatedValuePropagation/icmp.ll | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll index 72f09a949a060..e4de34c339d2d 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/icmp.ll @@ -1509,3 +1509,107 @@ end: ; %arg is within [-16, -8). 
ret void } + +define void @test_trunc_bittest(i8 %a) { +; CHECK-LABEL: @test_trunc_bittest( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A:%.*]] to i1 +; CHECK-NEXT: br i1 [[TRUNC]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[A]], 0 +; CHECK-NEXT: call void @check1(i1 [[CMP1]]) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: call void @check1(i1 [[CMP2]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %trunc = trunc i8 %a to i1 + br i1 %trunc, label %if.true, label %if.false + +if.true: + %cmp1 = icmp ne i8 %a, 0 + call void @check1(i1 %cmp1) + %cmp2 = icmp eq i8 %a, 0 + call void @check1(i1 %cmp2) + ret void + +if.false: + ret void +} + +define void @test_trunc_not_bittest(i8 %a) { +; CHECK-LABEL: @test_trunc_not_bittest( +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A:%.*]] to i1 +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[TRUNC]], true +; CHECK-NEXT: br i1 [[NOT]], label [[IF_FALSE:%.*]], label [[IF_TRUE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[A]], -1 +; CHECK-NEXT: call void @check1(i1 [[CMP1]]) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[A]], -1 +; CHECK-NEXT: call void @check1(i1 [[CMP2]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %trunc = trunc i8 %a to i1 + %not = xor i1 %trunc, true + br i1 %not, label %if.true, label %if.false + +if.true: + %cmp1 = icmp ne i8 %a, -1 + call void @check1(i1 %cmp1) + %cmp2 = icmp eq i8 %a, -1 + call void @check1(i1 %cmp2) + ret void + +if.false: + ret void +} + +define void @test_icmp_trunc(i8 %a) { +; CHECK-LABEL: @test_icmp_trunc( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[A:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A]] to i1 +; CHECK-NEXT: call void @check1(i1 [[TRUNC]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %cmp1 = icmp ne i8 %a, 0 + br i1 %cmp1, label %if.true, label %if.false + +if.true: + %trunc = trunc i8 %a to i1 + call void @check1(i1 %trunc) + ret void + +if.false: + ret void +} + +define void @test_icmp_trunc_not(i8 %a) { +; CHECK-LABEL: @test_icmp_trunc_not( +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[A:%.*]], -1 +; CHECK-NEXT: br i1 [[CMP1]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]] +; CHECK: if.true: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[A]] to i1 +; CHECK-NEXT: [[NOT:%.*]] = xor i1 [[TRUNC]], true +; CHECK-NEXT: call void @check1(i1 [[TRUNC]]) +; CHECK-NEXT: ret void +; CHECK: if.false: +; CHECK-NEXT: ret void +; + %cmp1 = icmp eq i8 %a, -1 + br i1 %cmp1, label %if.true, label %if.false + +if.true: + %trunc = trunc i8 %a to i1 + %not = xor i1 %trunc, true + call void @check1(i1 %trunc) + ret void + +if.false: + ret void +} From e8e75e08c9214fe25b56535fc26f5435a875a137 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 26 Jan 2025 09:46:38 -0800 Subject: [PATCH 143/432] [lld-macho] Remove unneeded functions from BPSectionOrderer. 
NFC --- lld/MachO/BPSectionOrderer.cpp | 12 +++++------- lld/MachO/BPSectionOrderer.h | 8 +------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/lld/MachO/BPSectionOrderer.cpp b/lld/MachO/BPSectionOrderer.cpp index 18c8aad58344f..c2d03c968534e 100644 --- a/lld/MachO/BPSectionOrderer.cpp +++ b/lld/MachO/BPSectionOrderer.cpp @@ -26,8 +26,7 @@ DenseMap lld::macho::runBalancedPartitioning( auto *isec = subsec.isec; if (!isec || isec->data.empty() || !isec->data.data()) continue; - sections.emplace_back( - std::make_unique<BPSectionMacho>(isec, sections.size())); + sections.emplace_back(std::make_unique<BPSectionMacho>(isec)); } } } @@ -38,11 +37,10 @@ DenseMap lld::macho::runBalancedPartitioning( DenseMap result; for (const auto &[sec, priority] : reorderedSections) { - if (auto *machoSection = dyn_cast<BPSectionMacho>(sec)) { - result.try_emplace( - static_cast<const InputSection *>(machoSection->getSection()), - priority); - } + result.try_emplace( + static_cast<const InputSection *>( + static_cast<const BPSectionMacho *>(sec)->getSection()), + priority); } return result; } diff --git a/lld/MachO/BPSectionOrderer.h b/lld/MachO/BPSectionOrderer.h index 69c6b260f044c..e3e6b12092e20 100644 --- a/lld/MachO/BPSectionOrderer.h +++ b/lld/MachO/BPSectionOrderer.h @@ -57,18 +57,14 @@ class BPSymbolMacho : public BPSymbol { class BPSectionMacho : public BPSectionBase { const InputSection *isec; - uint64_t sectionIdx; public: - explicit BPSectionMacho(const InputSection *sec, uint64_t sectionIdx) - : isec(sec), sectionIdx(sectionIdx) {} + explicit BPSectionMacho(const InputSection *sec) : isec(sec) {} const void *getSection() const override { return isec; } uint64_t getSize() const override { return isec->getSize(); } - uint64_t getSectionIdx() const { return sectionIdx; } bool isCodeSection() const override { return macho::isCodeSection(isec); } SmallVector<std::unique_ptr<BPSymbol>> getSymbols() const override { @@ -118,8 +114,6 @@ class BPSectionMacho : public BPSectionBase { hashes.erase(std::unique(hashes.begin(), hashes.end()), hashes.end()); } - static bool classof(const BPSectionBase *s) { return true; } - private: static uint64_t getRelocHash(const Reloc &reloc, From ccc066e8d5a742f79b41a0f90ef309d5b9e92c2a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sun, 26 Jan 2025 11:50:10 -0800 Subject: [PATCH 144/432] [TableGen] Avoid repeated map lookups (NFC) (#124448) This patch avoids repeated map lookups and constructions of temporary std::string instances by switching to DenseSet.
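To illustrate the pattern (a minimal sketch, assuming a DenseSet<StringRef> named Seen; the names here are hypothetical, not taken from the patch): DenseSet::insert returns a std::pair whose .second member reports whether the element was newly inserted, so a single call both tests and records membership, avoiding the separate find() plus the temporary std::string key that the std::set version needed.

llvm::DenseSet<llvm::StringRef> Seen;
if (Seen.insert(Name).second) {
  // First occurrence of Name: emit the builtin exactly once.
}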
--- clang/utils/TableGen/MveEmitter.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index 8ebd0bb800fef..58a4d3c22ac36 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1955,18 +1955,17 @@ void MveEmitter::EmitBuiltinDef(raw_ostream &OS) { << ", \"\", \"n\")\n"; } - std::set<std::string> ShortNamesSeen; + DenseSet<StringRef> ShortNamesSeen; for (const auto &kv : ACLEIntrinsics) { const ACLEIntrinsic &Int = *kv.second; if (Int.polymorphic()) { StringRef Name = Int.shortName(); - if (ShortNamesSeen.find(std::string(Name)) == ShortNamesSeen.end()) { + if (ShortNamesSeen.insert(Name).second) { OS << "BUILTIN(__builtin_arm_mve_" << Name << ", \"vi.\", \"nt"; if (Int.nonEvaluating()) OS << "u"; // indicate that this builtin doesn't evaluate its args OS << "\")\n"; - ShortNamesSeen.insert(std::string(Name)); } } } From 1c4341d176492da5f276937b84a3d0c959e4cf5b Mon Sep 17 00:00:00 2001 From: Vasileios Porpodas Date: Sat, 25 Jan 2025 09:46:42 -0800 Subject: [PATCH 145/432] [SandboxVec][DAG] Fix interval check without Node This patch moves the check of whether a node exists before the check of whether it is contained in the interval. --- .../Vectorize/SandboxVectorizer/DependencyGraph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index 390a5e9688cc7..7aa8794d26b20 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -267,11 +267,11 @@ void DependencyGraph::setDefUseUnscheduledSuccs( auto *OpI = dyn_cast<Instruction>(Op); if (OpI == nullptr) continue; - if (!TopInterval.contains(OpI)) - continue; auto *OpN = getNode(OpI); if (OpN == nullptr) continue; + if (!TopInterval.contains(OpI)) + continue; ++OpN->UnscheduledSuccs; } } From fb01a289038c16e13c6133ee602a58254b349411 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sun, 26 Jan 2025 22:11:40 +0100 Subject: [PATCH 146/432] [LLD][COFF] Implement support for hybrid IAT on ARM64X (#124189) In hybrid images, the PE header references a single IAT for both native and EC views, merging entries where possible. When merging isn't feasible, different imports are grouped together, and ARM64X relocations are emitted as needed. --- lld/COFF/Chunks.cpp | 27 +- lld/COFF/DLL.cpp | 146 ++++++++- lld/COFF/InputFiles.cpp | 14 +- lld/COFF/InputFiles.h | 6 +- lld/test/COFF/arm64x-import.test | 533 +++++++++++++++++++++++++++++++ 5 files changed, 706 insertions(+), 20 deletions(-) create mode 100644 lld/test/COFF/arm64x-import.test diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 11e7cf4346b23..a01c69c709876 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -1172,11 +1172,12 @@ uint64_t Arm64XRelocVal::get() const { size_t Arm64XDynamicRelocEntry::getSize() const { switch (type) { + case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: + return sizeof(uint16_t); // Just a header. case IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE: return sizeof(uint16_t) + size; // A header and a payload. case IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA: - case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: - llvm_unreachable("unsupported type"); + return 2 * sizeof(uint16_t); // A header and a delta.
} llvm_unreachable("invalid type"); } @@ -1186,6 +1187,9 @@ void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const { *out = (offset.get() & 0xfff) | (type << 12); switch (type) { + case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: + *out |= ((bit_width(size) - 1) << 14); // Encode the size. + break; case IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE: *out |= ((bit_width(size) - 1) << 14); // Encode the size. switch (size) { @@ -1203,8 +1207,23 @@ void Arm64XDynamicRelocEntry::writeTo(uint8_t *buf) const { } break; case IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA: - case IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL: - llvm_unreachable("unsupported type"); + int delta = value.get(); + // Negative offsets use a sign bit in the header. + if (delta < 0) { + *out |= 1 << 14; + delta = -delta; + } + // Depending on the value, the delta is encoded with a shift of 2 or 3 bits. + if (delta & 7) { + assert(!(delta & 3)); + delta >>= 2; + } else { + *out |= (1 << 15); + delta >>= 3; + } + out[1] = delta; + assert(!(delta & ~0xffff)); + break; } } diff --git a/lld/COFF/DLL.cpp b/lld/COFF/DLL.cpp index ae3a8047b7008..b6fbd5a484b5e 100644 --- a/lld/COFF/DLL.cpp +++ b/lld/COFF/DLL.cpp @@ -716,6 +716,63 @@ class ExportOrdinalChunk : public NonSectionChunk { void IdataContents::create(COFFLinkerContext &ctx) { std::vector<std::vector<DefinedImportData *>> v = binImports(ctx, imports); + // In hybrid images, EC and native code are usually very similar, + // resulting in a highly similar set of imported symbols. Consequently, + // their import tables can be shared, with ARM64X relocations handling any + // differences. Identify matching import files used by EC and native code, and + // merge them into a single hybrid import entry. + if (ctx.hybridSymtab) { + for (std::vector<DefinedImportData *> &syms : v) { + std::vector<DefinedImportData *> hybridSyms; + ImportFile *prev = nullptr; + for (DefinedImportData *sym : syms) { + ImportFile *file = sym->file; + // At this stage, symbols are sorted by base name, ensuring that + // compatible import files, if present, are adjacent. Check if the + // current symbol's file imports the same symbol as the previously added + // one (if any and if it was not already merged). Additionally, verify + // that one of them is native while the other is EC. In rare cases, + // separate matching import entries may exist within the same namespace, + // which cannot be merged. + if (!prev || file->isEC() == prev->isEC() || + !file->isSameImport(prev)) { + // We can't merge the import file, just add it to hybridSyms + // and set prev to its file so that we can try to match the next + // symbol. + hybridSyms.push_back(sym); + prev = file; + continue; + } + + // A matching symbol may appear in syms in any order. The native variant + // exposes a subset of EC symbols and chunks, so always use the EC + // variant as the hybrid import file. If the native file was already + // added, replace it with the EC symbol in hybridSyms. Otherwise, the EC + // variant is already pushed, so we can simply merge it. + if (file->isEC()) { + hybridSyms.pop_back(); + hybridSyms.push_back(sym); + } + + // Merge import files by storing their hybrid form in the corresponding + // file class. + prev->hybridFile = file; + file->hybridFile = prev; + prev = nullptr; // A hybrid import file cannot be merged again. + } + + // Sort symbols by type: native-only files first, followed by merged + // hybrid files, and then EC-only files.
+ llvm::stable_sort(hybridSyms, + [](DefinedImportData *a, DefinedImportData *b) { + if (a->file->hybridFile) + return !b->file->hybridFile && b->file->isEC(); + return !a->file->isEC() && b->file->isEC(); + }); + syms = std::move(hybridSyms); + } + } + // Create .idata contents for each DLL. for (std::vector<DefinedImportData *> &syms : v) { // Create lookup and address tables. If they have external names, @@ -723,19 +780,56 @@ void IdataContents::create(COFFLinkerContext &ctx) { // If they don't (if they are import-by-ordinals), we store only // ordinal values to the table. size_t base = lookups.size(); + Chunk *lookupsTerminator = nullptr, *addressesTerminator = nullptr; for (DefinedImportData *s : syms) { uint16_t ord = s->getOrdinal(); + HintNameChunk *hintChunk = nullptr; + Chunk *lookupsChunk, *addressesChunk; + if (s->getExternalName().empty()) { - lookups.push_back(make<OrdinalOnlyChunk>(ctx, ord)); - addresses.push_back(make<OrdinalOnlyChunk>(ctx, ord)); + lookupsChunk = make<OrdinalOnlyChunk>(ctx, ord); + addressesChunk = make<OrdinalOnlyChunk>(ctx, ord); } else { - auto *c = make<HintNameChunk>(s->getExternalName(), ord); - lookups.push_back(make<LookupChunk>(ctx, c)); - addresses.push_back(make<LookupChunk>(ctx, c)); - hints.push_back(c); + hintChunk = make<HintNameChunk>(s->getExternalName(), ord); + lookupsChunk = make<LookupChunk>(ctx, hintChunk); + addressesChunk = make<LookupChunk>(ctx, hintChunk); + hints.push_back(hintChunk); } - if (s->file->impECSym) { + // Detect the first EC-only import in the hybrid IAT. Emit null chunk + // as a terminator for the native view, and add an ARM64X relocation to + // replace it with the correct import for the EC view. + // + // Additionally, for MSVC compatibility, store the lookup and address + // chunks and append them at the end of EC-only imports, where a null + // terminator chunk would typically be placed. Since they appear after + // the native terminator, they will be ignored in the native view. + // In the EC view, they should act as terminators, so emit ZEROFILL + // relocations overriding them. + if (ctx.hybridSymtab && !lookupsTerminator && s->file->isEC() && + !s->file->hybridFile) { + lookupsTerminator = lookupsChunk; + addressesTerminator = addressesChunk; + lookupsChunk = make<NullChunk>(ctx); + addressesChunk = make<NullChunk>(ctx); + + Arm64XRelocVal relocVal = hintChunk; + if (!hintChunk) + relocVal = (1ULL << 63) | ord; + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, + sizeof(uint64_t), lookupsChunk, relocVal); + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_VALUE, + sizeof(uint64_t), addressesChunk, relocVal); + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL, + sizeof(uint64_t), lookupsTerminator); + ctx.dynamicRelocs->add(IMAGE_DVRT_ARM64X_FIXUP_TYPE_ZEROFILL, + sizeof(uint64_t), addressesTerminator); + } + + lookups.push_back(lookupsChunk); + addresses.push_back(addressesChunk); + + if (s->file->isEC()) { auto chunk = make<AuxImportChunk>(s->file); auxIat.push_back(chunk); s->file->impECSym->setLocation(chunk); @@ -743,18 +837,27 @@ void IdataContents::create(COFFLinkerContext &ctx) { chunk = make<AuxImportChunk>(s->file); auxIatCopy.push_back(chunk); s->file->auxImpCopySym->setLocation(chunk); + } else if (ctx.hybridSymtab) { + // Fill the auxiliary IAT with null chunks for native-only imports. + auxIat.push_back(make<NullChunk>(ctx)); + auxIatCopy.push_back(make<NullChunk>(ctx)); } } // Terminate with null values. - lookups.push_back(make<NullChunk>(ctx)); - addresses.push_back(make<NullChunk>(ctx)); - if (ctx.config.machine == ARM64EC) { + lookups.push_back(lookupsTerminator ? lookupsTerminator + : make<NullChunk>(ctx)); + addresses.push_back(addressesTerminator ?
addressesTerminator : make<NullChunk>(ctx)); if (ctx.symtabEC) { auxIat.push_back(make<NullChunk>(ctx)); auxIatCopy.push_back(make<NullChunk>(ctx)); } - for (int i = 0, e = syms.size(); i < e; ++i) + for (int i = 0, e = syms.size(); i < e; ++i) { syms[i]->setLocation(addresses[base + i]); + if (syms[i]->file->hybridFile) + syms[i]->file->hybridFile->impSym->setLocation(addresses[base + i]); + } // Create the import table header. dllNames.push_back(make<StringChunk>(syms[0]->getDLLName())); auto *dir = make<ImportDirectoryChunk>(dllNames.back()); dir->lookupTab = lookups[base]; dir->addressTab = addresses[base]; dirs.push_back(dir); + + if (ctx.hybridSymtab) { + // If native-only imports exist, they will appear as a prefix to all + // imports. Emit ARM64X relocations to skip them in the EC view. + uint32_t nativeOnly = + llvm::find_if(syms, + [](DefinedImportData *s) { return s->file->isEC(); }) - + syms.begin(); + if (nativeOnly) { + ctx.dynamicRelocs->add( + IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, + Arm64XRelocVal( + dir, offsetof(ImportDirectoryTableEntry, ImportLookupTableRVA)), + nativeOnly * sizeof(uint64_t)); + ctx.dynamicRelocs->add( + IMAGE_DVRT_ARM64X_FIXUP_TYPE_DELTA, 0, + Arm64XRelocVal(dir, offsetof(ImportDirectoryTableEntry, + ImportAddressTableRVA)), + nativeOnly * sizeof(uint64_t)); + } + } } // Add null terminator. dirs.push_back(make<NullChunk>(sizeof(ImportDirectoryTableEntry), 4)); diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 47faf70e099e1..7b105fb4c17a2 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1129,15 +1129,21 @@ void ObjFile::enqueuePdbFile(StringRef path, ObjFile *fromFile) { } ImportFile::ImportFile(COFFLinkerContext &ctx, MemoryBufferRef m) - : InputFile(ctx.symtab, ImportKind, m), live(!ctx.config.doGC) {} + : InputFile(ctx.getSymtab(getMachineType(m)), ImportKind, m), + live(!ctx.config.doGC) {} -MachineTypes ImportFile::getMachineType() const { +MachineTypes ImportFile::getMachineType(MemoryBufferRef m) { uint16_t machine = - reinterpret_cast<const coff_import_header *>(mb.getBufferStart()) ->Machine; + reinterpret_cast<const coff_import_header *>(m.getBufferStart())->Machine; return MachineTypes(machine); } +bool ImportFile::isSameImport(const ImportFile *other) const { + if (!externalName.empty()) + return other->externalName == externalName; + return hdr->OrdinalHint == other->hdr->OrdinalHint; +} + ImportThunkChunk *ImportFile::makeImportThunk() { switch (hdr->Machine) { case AMD64: diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 823561cda247a..21b9aeef21d4f 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -351,11 +351,15 @@ class ImportFile : public InputFile { explicit ImportFile(COFFLinkerContext &ctx, MemoryBufferRef m); static bool classof(const InputFile *f) { return f->kind() == ImportKind; } - MachineTypes getMachineType() const override; + MachineTypes getMachineType() const override { return getMachineType(mb); } + static MachineTypes getMachineType(MemoryBufferRef m); + bool isSameImport(const ImportFile *other) const; + bool isEC() const { return impECSym != nullptr; } DefinedImportData *impSym = nullptr; Defined *thunkSym = nullptr; ImportThunkChunkARM64EC *impchkThunk = nullptr; + ImportFile *hybridFile = nullptr; std::string dllName; private: diff --git a/lld/test/COFF/arm64x-import.test b/lld/test/COFF/arm64x-import.test new file mode 100644 index 0000000000000..bc202e1d17251 --- /dev/null +++ b/lld/test/COFF/arm64x-import.test @@ -0,0 +1,533 @@ +REQUIRES: aarch64 +RUN: split-file %s %t.dir && cd %t.dir + +RUN: llvm-mc -filetype=obj
-triple=arm64ec-windows func12-thunks-arm64ec.s -o func12-thunks-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func12-thunks-arm64.s -o func12-thunks-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func12-arm64ec.s -o func12-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func123-arm64.s -o func123-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func123-arm64ec.s -o func123-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func12-arm64.s -o func12-arm64.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func234-arm64.s -o func234-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func12o-arm64ec.s -o func12o-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func34-arm64.s -o func34-arm64.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows func34o-arm64.s -o func34o-arm64.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows funco-arm64.s -o funco-arm64.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows icall.s -o icall.obj +RUN: llvm-mc -filetype=obj -triple=arm64ec-windows %S/Inputs/loadconfig-arm64ec.s -o loadconfig-arm64ec.obj +RUN: llvm-mc -filetype=obj -triple=aarch64-windows %S/Inputs/loadconfig-arm64.s -o loadconfig-arm64.obj +RUN: llvm-lib -machine:arm64ec -def:imp.def -out:imp-arm64ec.lib +RUN: llvm-lib -machine:arm64 -def:imp.def -out:imp-arm64.lib +RUN: llvm-lib -machine:arm64x -def:imp.def -defArm64Native:imp.def -out:imp-arm64x.lib +RUN: llvm-lib -machine:arm64x -def:imp-ord10.def -defArm64Native:imp.def -out:imp-ecord.lib +RUN: llvm-lib -machine:arm64x -def:imp-ord10.def -defArm64Native:imp-ord20.def -out:imp-ecord.lib +RUN: llvm-lib -machine:arm64x -def:imp2.def -defArm64Native:imp2.def -out:imp2.lib +RUN: llvm-lib -machine:arm64x -def:noname-ec.def -defArm64Native:noname-native.def -out:noname.lib +RUN: llvm-lib -machine:arm64x -def:dup-ec.def -defArm64Native:dup-native.def -out:dup.lib + + +# Link to the imported func1, func2, and func1's thunks from both native and EC code. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-12-thunks.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-thunks-arm64ec.obj func12-thunks-arm64.obj imp-arm64ec.lib imp-arm64.lib + +RUN: llvm-objdump -d test-12-thunks.dll | FileCheck --check-prefix=DISASM-12T %s +DISASM-12T: 0000000180001000 <.text>: +DISASM-12T-NEXT: 180001000: f0000010 adrp x16, 0x180004000 +DISASM-12T-NEXT: 180001004: f9400610 ldr x16, [x16, #0x8] +DISASM-12T-NEXT: 180001008: d61f0200 br x16 +DISASM-12T-NEXT: ... 
+DISASM-12T-NEXT: 180002000: 52800040 mov w0, #0x2 // =2 +DISASM-12T-NEXT: 180002004: d65f03c0 ret +DISASM-12T-NEXT: 180002008: 90000030 adrp x16, 0x180006000 +DISASM-12T-NEXT: 18000200c: f9400210 ldr x16, [x16] +DISASM-12T-NEXT: 180002010: d61f0200 br x16 +DISASM-12T-NEXT: 180002014: d000000b adrp x11, 0x180004000 +DISASM-12T-NEXT: 180002018: f940016b ldr x11, [x11] +DISASM-12T-NEXT: 18000201c: 9000000a adrp x10, 0x180002000 <.text+0x1000> +DISASM-12T-NEXT: 180002020: 9100f14a add x10, x10, #0x3c +DISASM-12T-NEXT: 180002024: 17fffff7 b 0x180002000 <.text+0x1000> +DISASM-12T-NEXT: 180002028: d000000b adrp x11, 0x180004000 +DISASM-12T-NEXT: 18000202c: f940056b ldr x11, [x11, #0x8] +DISASM-12T-NEXT: 180002030: d0ffffea adrp x10, 0x180000000 +DISASM-12T-NEXT: 180002034: 9100014a add x10, x10, #0x0 +DISASM-12T-NEXT: 180002038: 17fffff2 b 0x180002000 <.text+0x1000> +DISASM-12T-NEXT: 18000203c: 52800060 mov w0, #0x3 // =3 +DISASM-12T-NEXT: 180002040: d65f03c0 ret +DISASM-12T-NEXT: ... +DISASM-12T-NEXT: 180003000: ff 25 fa 0f 00 00 jmpq *0xffa(%rip) # 0x180004000 + +RUN: llvm-readobj --coff-imports test-12-thunks.dll | FileCheck --check-prefix=IMPORTS-12 %s +IMPORTS-12: Import { +IMPORTS-12-NEXT: Name: test.dll +IMPORTS-12-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-12-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-12-NEXT: Symbol: func1 (0) +IMPORTS-12-NEXT: Symbol: func2 (0) +IMPORTS-12-NEXT: } +IMPORTS-12-NEXT: HybridObject { +IMPORTS-12: Import { +IMPORTS-12-NEXT: Name: test.dll +IMPORTS-12-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-12-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-12-NEXT: Symbol: func1 (0) +IMPORTS-12-NEXT: Symbol: func2 (0) +IMPORTS-12-NEXT: } +IMPORTS-12-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test-12-thunks.dll | FileCheck --check-prefix=FUNC-12-THUNKS %s +FUNC-12-THUNKS: 0x180009000 00600000 00400000 00300000 08200000 +FUNC-12-THUNKS-NEXT: 0x180009010 08600000 08400000 + +RUN: llvm-readobj --hex-dump=.testa test-12-thunks.dll | FileCheck --check-prefix=FUNC-12-THUNKSA %s +FUNC-12-THUNKSA: 0x18000a000 00400000 08400000 00100000 + + +# If the ordinals of named imports don't match, use the EC value. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-12-thunks-ord.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-thunks-arm64ec.obj func12-thunks-arm64.obj imp-ecord.lib +RUN: llvm-readobj --coff-imports test-12-thunks-ord.dll | FileCheck --check-prefix=IMPORTS-ORD %s + +IMPORTS-ORD: Format: COFF-ARM64X +IMPORTS-ORD-NEXT: Arch: aarch64 +IMPORTS-ORD-NEXT: AddressSize: 64bit +IMPORTS-ORD-NEXT: Import { +IMPORTS-ORD-NEXT: Name: test.dll +IMPORTS-ORD-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-ORD-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-ORD-NEXT: Symbol: func1 (11) +IMPORTS-ORD-NEXT: Symbol: func2 (12) +IMPORTS-ORD-NEXT: } +IMPORTS-ORD-NEXT: HybridObject { +IMPORTS-ORD-NEXT: Format: COFF-ARM64EC +IMPORTS-ORD-NEXT: Arch: aarch64 +IMPORTS-ORD-NEXT: AddressSize: 64bit +IMPORTS-ORD-NEXT: Import { +IMPORTS-ORD-NEXT: Name: test.dll +IMPORTS-ORD-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-ORD-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-ORD-NEXT: Symbol: func1 (11) +IMPORTS-ORD-NEXT: Symbol: func2 (12) +IMPORTS-ORD-NEXT: } +IMPORTS-ORD-NEXT: } + + +# Link to NONAME imports. 
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:test-noname.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-thunks-arm64ec.obj func12-thunks-arm64.obj noname.lib +RUN: llvm-readobj --coff-imports test-noname.dll | FileCheck --check-prefix=IMPORTS-ORD2 %s + +IMPORTS-ORD2: Format: COFF-ARM64X +IMPORTS-ORD2-NEXT: Arch: aarch64 +IMPORTS-ORD2-NEXT: AddressSize: 64bit +IMPORTS-ORD2-NEXT: Import { +IMPORTS-ORD2-NEXT: Name: test.dll +IMPORTS-ORD2-NEXT: ImportLookupTableRVA: 0x5348 +IMPORTS-ORD2-NEXT: ImportAddressTableRVA: 0x4000 +IMPORTS-ORD2-NEXT: Symbol: (12) +IMPORTS-ORD2-NEXT: Symbol: (11) +IMPORTS-ORD2-NEXT: } +IMPORTS-ORD2-NEXT: HybridObject { +IMPORTS-ORD2-NEXT: Format: COFF-ARM64EC +IMPORTS-ORD2-NEXT: Arch: aarch64 +IMPORTS-ORD2-NEXT: AddressSize: 64bit +IMPORTS-ORD2-NEXT: Import { +IMPORTS-ORD2-NEXT: Name: test.dll +IMPORTS-ORD2-NEXT: ImportLookupTableRVA: 0x5350 +IMPORTS-ORD2-NEXT: ImportAddressTableRVA: 0x4008 +IMPORTS-ORD2-NEXT: Symbol: (11) +IMPORTS-ORD2-NEXT: Symbol: (10) +IMPORTS-ORD2-NEXT: } +IMPORTS-ORD2-NEXT: } + +# Link to the imported func1 and func2 from both native and EC code, and func3 from native code. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test2.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-arm64ec.obj func123-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports test2.dll | FileCheck --check-prefix=IMPORTS-123-12 %s +IMPORTS-123-12: Import { +IMPORTS-123-12-NEXT: Name: test.dll +IMPORTS-123-12-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-123-12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-123-12-NEXT: Symbol: func3 (0) +IMPORTS-123-12-NEXT: Symbol: func1 (0) +IMPORTS-123-12-NEXT: Symbol: func2 (0) +IMPORTS-123-12-NEXT: } +IMPORTS-123-12-NEXT: HybridObject { +IMPORTS-123-12: Import { +IMPORTS-123-12-NEXT: Name: test.dll +IMPORTS-123-12-NEXT: ImportLookupTableRVA: 0x3340 +IMPORTS-123-12-NEXT: ImportAddressTableRVA: 0x2008 +IMPORTS-123-12-NEXT: Symbol: func1 (0) +IMPORTS-123-12-NEXT: Symbol: func2 (0) +IMPORTS-123-12-NEXT: } +IMPORTS-123-12-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test2.dll | FileCheck --check-prefix=TEST-123-12 %s +TEST-123-12: 0x180007000 08400000 08200000 10400000 10200000 + +RUN: llvm-readobj --hex-dump=.testa test2.dll | FileCheck --check-prefix=TEST-123-12A %s +TEST-123-12A: 0x180008000 08200000 10200000 00200000 + +RUN: llvm-readobj --hex-dump=.rdata test2.dll | FileCheck --check-prefix=TEST-123-12AUX %s +TEST-123-12AUX: 0x180004000 00000000 00000000 08100080 01000000 +TEST-123-12AUX-NEXT: 0x180004010 1c100080 01000000 00000000 00000000 + + +# Link to the imported func1 and func2 from both native and EC code, and func3 from EC code. 
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:func-12-123.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func123-arm64ec.obj func12-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports func-12-123.dll | FileCheck --check-prefix=IMPORTS-12-123 %s +IMPORTS-12-123: Import { +IMPORTS-12-123-NEXT: Name: test.dll +IMPORTS-12-123-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-12-123-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-12-123-NEXT: Symbol: func1 (0) +IMPORTS-12-123-NEXT: Symbol: func2 (0) +IMPORTS-12-123-NEXT: } +IMPORTS-12-123-NEXT: HybridObject { +IMPORTS-12-123: Import { +IMPORTS-12-123-NEXT: Name: test.dll +IMPORTS-12-123-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-12-123-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-12-123-NEXT: Symbol: func1 (0) +IMPORTS-12-123-NEXT: Symbol: func2 (0) +IMPORTS-12-123-NEXT: Symbol: func3 (0) +IMPORTS-12-123-NEXT: } +IMPORTS-12-123-NEXT: } + +RUN: llvm-readobj --hex-dump=.test func-12-123.dll | FileCheck --check-prefix=TEST-12-123 %s +TEST-12-123: 0x180007000 00400000 00200000 08400000 08200000 +TEST-12-123-NEXT: 0x180007010 10400000 10200000 + +RUN: llvm-readobj --hex-dump=.testa func-12-123.dll | FileCheck --check-prefix=TEST-12-123A %s +TEST-12-123A: 0x180008000 00200000 08200000 + +RUN: llvm-readobj --hex-dump=.rdata func-12-123.dll | FileCheck --check-prefix=TEST-12-123AUX %s +TEST-12-123AUX: 0x180004000 08100080 01000000 1c100080 01000000 +TEST-12-123AUX-NEXT: 0x180004010 30100080 01000000 00000000 00000000 + + +# Link to the imported func2 and func3 from both native and EC code, func4 from native code, +# and func1 from EC code. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-234-123.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func123-arm64ec.obj func234-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports test-234-123.dll | FileCheck --check-prefix=IMPORTS-234-123 %s +IMPORTS-234-123: Import { +IMPORTS-234-123-NEXT: Name: test.dll +IMPORTS-234-123-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-234-123-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-234-123-NEXT: Symbol: func4 (0) +IMPORTS-234-123-NEXT: Symbol: func2 (0) +IMPORTS-234-123-NEXT: Symbol: func3 (0) +IMPORTS-234-123-NEXT: } +IMPORTS-234-123-NEXT: HybridObject { +IMPORTS-234-123: Import { +IMPORTS-234-123-NEXT: Name: test.dll +IMPORTS-234-123-NEXT: ImportLookupTableRVA: 0x3340 +IMPORTS-234-123-NEXT: ImportAddressTableRVA: 0x2008 +IMPORTS-234-123-NEXT: Symbol: func2 (0) +IMPORTS-234-123-NEXT: Symbol: func3 (0) +IMPORTS-234-123-NEXT: Symbol: func1 (0) +IMPORTS-234-123-NEXT: } +IMPORTS-234-123-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test-234-123.dll | FileCheck --check-prefix=TEST-234-123 %s +TEST-234-123: 0x180007000 18400000 18200000 08400000 08200000 +TEST-234-123-NEXT: 0x180007010 10400000 1020000 + +RUN: llvm-readobj --hex-dump=.testa test-234-123.dll | FileCheck --check-prefix=TEST-234-123A %s +TEST-234-123A: 0x180008000 08200000 10200000 00200000 + + +# Link to the imported func3 and func4 from native code, and func1 and func2 from EC code. 
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:test-34-12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12o-arm64ec.obj func34o-arm64.obj imp-arm64x.lib imp2.lib + +RUN: llvm-readobj --coff-imports test-34-12.dll | FileCheck --check-prefix=IMPORTS-34-12 %s +IMPORTS-34-12: Import { +IMPORTS-34-12-NEXT: Name: test.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3350 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-34-12-NEXT: Symbol: func3 (0) +IMPORTS-34-12-NEXT: Symbol: func4 (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: Import { +IMPORTS-34-12-NEXT: Name: test2.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3378 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2028 +IMPORTS-34-12-NEXT: Symbol: otherfunc (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: HybridObject { +IMPORTS-34-12: Import { +IMPORTS-34-12-NEXT: Name: test.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3360 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2010 +IMPORTS-34-12-NEXT: Symbol: func1 (0) +IMPORTS-34-12-NEXT: Symbol: func2 (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: Import { +IMPORTS-34-12-NEXT: Name: test2.dll +IMPORTS-34-12-NEXT: ImportLookupTableRVA: 0x3378 +IMPORTS-34-12-NEXT: ImportAddressTableRVA: 0x2028 +IMPORTS-34-12-NEXT: Symbol: otherfunc (0) +IMPORTS-34-12-NEXT: } +IMPORTS-34-12-NEXT: } + +RUN: llvm-readobj --hex-dump=.test test-34-12.dll | FileCheck --check-prefix=TEST-23-12 %s +TEST-23-12: 0x180007000 10400000 10200000 18400000 18200000 +TEST-23-12-NEXT: 0x180007010 28400000 28200000 + +RUN: llvm-readobj --hex-dump=.testa test-34-12.dll | FileCheck --check-prefix=TEST-23-12A %s +TEST-23-12A: 0x180008000 00200000 08200000 28200000 + + +# Link only to imported EC functions, with no native imports. + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-ec12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: icall.obj func12-arm64ec.obj funco-arm64.obj imp-arm64x.lib imp2.lib + +RUN: llvm-readobj --coff-imports test-ec12.dll | FileCheck --check-prefix=IMPORTS-EC12 %s + +IMPORTS-EC12: File: test-ec12.dll +IMPORTS-EC12-NEXT: Format: COFF-ARM64X +IMPORTS-EC12-NEXT: Arch: aarch64 +IMPORTS-EC12-NEXT: AddressSize: 64bit +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3350 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test2.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3368 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2018 +IMPORTS-EC12-NEXT: Symbol: otherfunc (0) +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: HybridObject { +IMPORTS-EC12-NEXT: Format: COFF-ARM64EC +IMPORTS-EC12-NEXT: Arch: aarch64 +IMPORTS-EC12-NEXT: AddressSize: 64bit +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3350 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-EC12-NEXT: Symbol: func1 (0) +IMPORTS-EC12-NEXT: Symbol: func2 (0) +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: Import { +IMPORTS-EC12-NEXT: Name: test2.dll +IMPORTS-EC12-NEXT: ImportLookupTableRVA: 0x3370 +IMPORTS-EC12-NEXT: ImportAddressTableRVA: 0x2020 +IMPORTS-EC12-NEXT: } +IMPORTS-EC12-NEXT: } + + +# Link only to imported native functions, with no EC imports. 
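+# (Added note: the IMPORTS-N12 checks below show that the EC view still gets
+# an import directory entry for test.dll, but with no symbols listed.)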
+ +RUN: lld-link -machine:arm64x -dll -noentry -out:test-n12.dll loadconfig-arm64.obj loadconfig-arm64ec.obj \ +RUN: func12-arm64.obj imp-arm64x.lib + +RUN: llvm-readobj --coff-imports test-n12.dll | FileCheck --check-prefix=IMPORTS-N12 %s + +IMPORTS-N12: Arch: aarch64 +IMPORTS-N12-NEXT: AddressSize: 64bit +IMPORTS-N12-NEXT: Import { +IMPORTS-N12-NEXT: Name: test.dll +IMPORTS-N12-NEXT: ImportLookupTableRVA: 0x2330 +IMPORTS-N12-NEXT: ImportAddressTableRVA: 0x1000 +IMPORTS-N12-NEXT: Symbol: func1 (0) +IMPORTS-N12-NEXT: Symbol: func2 (0) +IMPORTS-N12-NEXT: } +IMPORTS-N12-NEXT: HybridObject { +IMPORTS-N12-NEXT: Format: COFF-ARM64EC +IMPORTS-N12-NEXT: Arch: aarch64 +IMPORTS-N12-NEXT: AddressSize: 64bit +IMPORTS-N12-NEXT: Import { +IMPORTS-N12-NEXT: Name: test.dll +IMPORTS-N12-NEXT: ImportLookupTableRVA: 0x2340 +IMPORTS-N12-NEXT: ImportAddressTableRVA: 0x1010 +IMPORTS-N12-NEXT: } +IMPORTS-N12-NEXT: } + + +RUN: lld-link -machine:arm64x -dll -noentry -out:test-dup.dll loadconfig-arm64.obj loadconfig-arm64ec.obj icall.obj \ +RUN: func12-arm64ec.obj func34-arm64.obj dup.lib + +RUN: llvm-readobj --coff-imports test-dup.dll | FileCheck --check-prefix=IMPORTS-DUP %s +IMPORTS-DUP: Format: COFF-ARM64X +IMPORTS-DUP-NEXT: Arch: aarch64 +IMPORTS-DUP-NEXT: AddressSize: 64bit +IMPORTS-DUP-NEXT: Import { +IMPORTS-DUP-NEXT: Name: test.dll +IMPORTS-DUP-NEXT: ImportLookupTableRVA: 0x3338 +IMPORTS-DUP-NEXT: ImportAddressTableRVA: 0x2000 +IMPORTS-DUP-NEXT: Symbol: func4 (0) +IMPORTS-DUP-NEXT: Symbol: func4 (0) +IMPORTS-DUP-NEXT: } +IMPORTS-DUP-NEXT: HybridObject { +IMPORTS-DUP-NEXT: Format: COFF-ARM64EC +IMPORTS-DUP-NEXT: Arch: aarch64 +IMPORTS-DUP-NEXT: AddressSize: 64bit +IMPORTS-DUP-NEXT: Import { +IMPORTS-DUP-NEXT: Name: test.dll +IMPORTS-DUP-NEXT: ImportLookupTableRVA: 0x3348 +IMPORTS-DUP-NEXT: ImportAddressTableRVA: 0x2010 +IMPORTS-DUP-NEXT: Symbol: func1 (0) +IMPORTS-DUP-NEXT: Symbol: func1 (0) +IMPORTS-DUP-NEXT: } +IMPORTS-DUP-NEXT: } + +#--- func12-thunks-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva func1 + .rva "#func1" + .rva __imp_func2 + .rva __imp_aux_func2 + +#--- func12-thunks-arm64.s + .section .testa, "r" + .rva __imp_func1 + .rva __imp_func2 + .rva func2 + +#--- func12-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva __imp_func2 + .rva __imp_aux_func2 + +#--- func123-arm64.s + .section .testa, "r" + .rva __imp_func1 + .rva __imp_func2 + .rva __imp_func3 + +#--- func123-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva __imp_func2 + .rva __imp_aux_func2 + .rva __imp_func3 + .rva __imp_aux_func3 + +#--- func12-arm64.s + .section .testa, "r" + .rva __imp_func1 + .rva __imp_func2 + +#--- func234-arm64.s + .section .testa, "r" + .rva __imp_func2 + .rva __imp_func3 + .rva __imp_func4 + +#--- func12o-arm64ec.s + .section .test, "r" + .rva __imp_func1 + .rva __imp_aux_func1 + .rva __imp_func2 + .rva __imp_aux_func2 + .rva __imp_otherfunc + .rva __imp_aux_otherfunc + +#--- func34-arm64.s + .section .testa, "r" + .rva __imp_func3 + .rva __imp_func4 + +#--- func34o-arm64.s + .section .testa, "r" + .rva __imp_func3 + .rva __imp_func4 + .rva __imp_otherfunc + +#--- funco-arm64.s + .section .testa, "r" + .rva __imp_otherfunc + +#--- icall.s + .text + .globl __icall_helper_arm64ec + .p2align 2, 0x0 +__icall_helper_arm64ec: + mov w0, #2 + ret + + .section .hybmp$x, "yi" + .symidx __imp_func1 + .symidx func1_exit_thunk + .word 4 + + .section .wowthk$aa,"xr",discard,func1_exit_thunk + .globl func1_exit_thunk 
+func1_exit_thunk: + mov w0, #3 + ret + +#--- imp.def +NAME test.dll +EXPORTS + data_sym DATA + func1 + func2 + func3 + func4 + +#--- imp-ord10.def +NAME test.dll +EXPORTS + data_sym DATA @10 + func1 @11 + func2 @12 + func3 @13 + func4 @14 + +#--- imp-ord20.def +NAME test.dll +EXPORTS + data_sym DATA @10 + func1 @21 + func2 @22 + func3 @23 + func4 @24 + +#--- imp2.def +NAME test2.dll +EXPORTS + otherfunc + +#--- noname-ec.def +NAME test.dll +EXPORTS + func1 @10 NONAME + func2 @11 NONAME + +#--- noname-native.def +NAME test.dll +EXPORTS + func1 @12 NONAME + func2 @11 NONAME + +#--- dup-ec.def +NAME test.dll +EXPORTS + func1 + func2 EXPORTAS func1 + +#--- dup-native.def +NAME test.dll +EXPORTS + func3 EXPORTAS func4 + func4 From 80ab237c1187aa7e8a1f546175887d768fa14e2d Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Sun, 26 Jan 2025 22:36:01 +0100 Subject: [PATCH 147/432] [LLD][COFF] Add REQUIRES: x86 to arm64x-import.test (NFC) This ensures the disassembler can handle ARM64X binaries correctly. Fixes #124189. --- lld/test/COFF/arm64x-import.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/COFF/arm64x-import.test b/lld/test/COFF/arm64x-import.test index bc202e1d17251..7441c71d87710 100644 --- a/lld/test/COFF/arm64x-import.test +++ b/lld/test/COFF/arm64x-import.test @@ -1,4 +1,4 @@ -REQUIRES: aarch64 +REQUIRES: aarch64, x86 RUN: split-file %s %t.dir && cd %t.dir RUN: llvm-mc -filetype=obj -triple=arm64ec-windows func12-thunks-arm64ec.s -o func12-thunks-arm64ec.obj From e278e1b6ece025ace4238748c0f57fda3ca833f9 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Sun, 26 Jan 2025 23:58:58 +0200 Subject: [PATCH 148/432] [NFC][CodeGen] Fix typos in code comments. (#124382) This fixes typos in the `calcUniqueIDUpdateFlagsAndSize` function. --- llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index fbbd92a2e0ca4..6ab6d18213ba4 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -732,7 +732,7 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, // that section can be assigned an incorrect entry size. To avoid this we // usually put symbols of the same size into distinct mergeable sections with // the same name. Doing so relies on the ",unique ," assembly feature. This - // feature is not avalible until bintuils version 2.35 + // feature is not available until binutils version 2.35 // (https://sourceware.org/bugzilla/show_bug.cgi?id=25380).
const bool SupportsUnique = Ctx.getAsmInfo()->useIntegratedAssembler() || Ctx.getAsmInfo()->binutilsIsAtLeast(2, 35); @@ -745,7 +745,7 @@ calcUniqueIDUpdateFlagsAndSize(const GlobalObject *GO, StringRef SectionName, const bool SymbolMergeable = Flags & ELF::SHF_MERGE; const bool SeenSectionNameBefore = Ctx.isELFGenericMergeableSection(SectionName); - // If this is the first ocurrence of this section name, treat it as the + // If this is the first occurrence of this section name, treat it as the // generic section if (!SymbolMergeable && !SeenSectionNameBefore) { if (TM.getSeparateNamedSections()) From c9637afec7ed72904c74c2fc71e990d378f3d7a6 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Sun, 26 Jan 2025 15:06:26 -0800 Subject: [PATCH 149/432] [Clang] Fix createConstexprUnknownAPValues to use zero offset when creating APValue (#124478) When implementing P2280R4 here: https://github.com/llvm/llvm-project/pull/95474 When creating the APValue to store a constexprUnknown value I used an offset of CharUnits::One() but it should have been CharUnits::Zero(). This change just adjusts that value. --- clang/lib/AST/ExprConstant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3b5ab839c6cf7..be8f1fe02e721 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -1961,7 +1961,7 @@ APValue & CallStackFrame::createConstexprUnknownAPValues(const VarDecl *Key, APValue::LValueBase Base) { APValue &Result = ConstexprUnknownAPValues[MapKeyTy(Key, Base.getVersion())]; - Result = APValue(Base, CharUnits::One(), APValue::ConstexprUnknown{}); + Result = APValue(Base, CharUnits::Zero(), APValue::ConstexprUnknown{}); return Result; } From bfa7de0df5d8eb8dd284b0f49f10e7f0cd850693 Mon Sep 17 00:00:00 2001 From: YunQiang Su Date: Mon, 27 Jan 2025 07:27:26 +0800 Subject: [PATCH 150/432] X86: Support FCANONICALIZE on f64/f80 for i686 with SSE2 or AVX (#123917) Currently, FCANONICALIZE is not enabled for f64 with SSE2, and is not enabled for f80 on 32-bit systems. Let's enable them.
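For illustration (added commentary, not part of the upstream commit message):
this path is reachable from C/C++ through Clang's __builtin_canonicalize
builtins, which lower to the llvm.canonicalize.* intrinsics and hence to
FCANONICALIZE nodes. A minimal sketch, assuming an i686-linux target built
with -msse2 or -mavx:

  double canon_f64(double x) { return __builtin_canonicalize(x); }
  long double canon_f80(long double x) { return __builtin_canonicalizel(x); }

With this change, both functions select the multiply-by-1.0 lowering
exercised by the canonicalize-vars.ll tests below, instead of hitting the
previous legalization gap.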
--- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +- llvm/test/CodeGen/X86/canonicalize-vars.ll | 336 ++++++++++++++++++++- 2 files changed, 335 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 23731212a420c..ce3c140af8105 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -334,10 +334,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); } setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); + setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); } } if (Subtarget.hasAVX10_2()) { @@ -367,7 +367,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); setOperationAction(ISD::FCANONICALIZE, MVT::f32, Custom); - setOperationAction(ISD::FCANONICALIZE, MVT::f80, Custom); setOperationAction(ISD::FCANONICALIZE, MVT::f64, Custom); if (Subtarget.is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); @@ -889,6 +888,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); + setOperationAction(ISD::FCANONICALIZE , MVT::f80, Custom); if (isTypeLegal(MVT::f16)) { setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); diff --git a/llvm/test/CodeGen/X86/canonicalize-vars.ll b/llvm/test/CodeGen/X86/canonicalize-vars.ll index 951ea1b72f439..67213b38277dc 100644 --- a/llvm/test/CodeGen/X86/canonicalize-vars.ll +++ b/llvm/test/CodeGen/X86/canonicalize-vars.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --default-march x86_64-unknown-linux-gnu --version 5 ; RUN: llc -mtriple=i686-- < %s | FileCheck %s -check-prefixes=X87 -; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE,SSE2 +; RUN: llc -mattr=+sse2 -mtriple=i686-- < %s | FileCheck %s -check-prefixes=X86-SSE +; RUN: llc -mattr=+avx -mtriple=i686-- < %s | FileCheck %s -check-prefixes=X86-AVX +; RUN: llc -mattr=+sse2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=SSE ; RUN: llc -mattr=+avx -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX1 ; RUN: llc -mattr=+avx2 -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX2 ; RUN: llc -mattr=+avx512f -mtriple=x86_64 < %s | FileCheck %s -check-prefixes=AVX,AVX512F @@ -12,6 +14,30 @@ define float @canon_fp32_varargsf32(float %a) { ; X87-NEXT: fmuls {{[0-9]+}}(%esp) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canon_fp32_varargsf32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp32_varargsf32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: vmovss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%esp) +; X86-AVX-NEXT: flds (%esp) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp32_varargsf32: ; SSE: # %bb.0: ; SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -33,6 +59,20 @@ define x86_fp80 @canon_fp32_varargsf80(x86_fp80 %a) { ; X87-NEXT: fmulp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canon_fp32_varargsf80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmulp %st, %st(1) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp32_varargsf80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmulp %st, %st(1) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp32_varargsf80: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%rsp) @@ -64,6 +104,32 @@ define x86_fp80 @complex_canonicalize_fmul_x86_fp80(x86_fp80 %a, x86_fp80 %b) { ; X87-NEXT: fsubp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: complex_canonicalize_fmul_x86_fp80: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fsub %st(1), %st +; X86-SSE-NEXT: fld %st(0) +; X86-SSE-NEXT: fadd %st(2), %st +; X86-SSE-NEXT: fsubp %st, %st(1) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmulp %st, %st(1) +; X86-SSE-NEXT: fsubp %st, %st(1) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: complex_canonicalize_fmul_x86_fp80: +; X86-AVX: # %bb.0: # %entry +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fsub %st(1), %st +; X86-AVX-NEXT: fld %st(0) +; X86-AVX-NEXT: fadd %st(2), %st +; X86-AVX-NEXT: fsubp %st, %st(1) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmulp %st, %st(1) +; X86-AVX-NEXT: fsubp %st, %st(1) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: complex_canonicalize_fmul_x86_fp80: ; SSE: # %bb.0: # %entry ; SSE-NEXT: fldt {{[0-9]+}}(%rsp) @@ -130,6 +196,54 @@ define double @canonicalize_fp64(double %a, double %b) unnamed_addr #0 { ; X87-NEXT: fmulp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canonicalize_fp64: +; X86-SSE: # %bb.0: # %start +; X86-SSE-NEXT: pushl %ebp +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: .cfi_offset %ebp, -8 +; X86-SSE-NEXT: movl %esp, %ebp +; X86-SSE-NEXT: .cfi_def_cfa_register %ebp +; X86-SSE-NEXT: andl $-8, %esp +; X86-SSE-NEXT: subl $8, %esp +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movapd %xmm0, %xmm2 +; X86-SSE-NEXT: cmpunordsd %xmm0, %xmm2 +; X86-SSE-NEXT: movapd %xmm2, %xmm3 +; X86-SSE-NEXT: andpd %xmm1, %xmm3 +; X86-SSE-NEXT: maxsd %xmm0, %xmm1 +; X86-SSE-NEXT: andnpd %xmm1, %xmm2 +; X86-SSE-NEXT: orpd %xmm3, %xmm2 +; X86-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE-NEXT: movsd %xmm2, (%esp) +; X86-SSE-NEXT: fldl (%esp) +; X86-SSE-NEXT: movl %ebp, %esp +; X86-SSE-NEXT: popl %ebp +; X86-SSE-NEXT: .cfi_def_cfa %esp, 4 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canonicalize_fp64: +; X86-AVX: # %bb.0: # %start +; X86-AVX-NEXT: pushl %ebp +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: .cfi_offset %ebp, -8 +; X86-AVX-NEXT: movl %esp, %ebp +; X86-AVX-NEXT: .cfi_def_cfa_register %ebp +; X86-AVX-NEXT: andl $-8, %esp +; X86-AVX-NEXT: subl $8, %esp +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2 +; X86-AVX-NEXT: 
vcmpunordsd %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 +; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovsd %xmm0, (%esp) +; X86-AVX-NEXT: fldl (%esp) +; X86-AVX-NEXT: movl %ebp, %esp +; X86-AVX-NEXT: popl %ebp +; X86-AVX-NEXT: .cfi_def_cfa %esp, 4 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canonicalize_fp64: ; SSE: # %bb.0: # %start ; SSE-NEXT: movapd %xmm0, %xmm2 @@ -207,6 +321,42 @@ define float @canonicalize_fp32(float %aa, float %bb) unnamed_addr #0 { ; X87-NEXT: fmulp %st, %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canonicalize_fp32: +; X86-SSE: # %bb.0: # %start +; X86-SSE-NEXT: pushl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 8 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movaps %xmm0, %xmm2 +; X86-SSE-NEXT: cmpunordss %xmm0, %xmm2 +; X86-SSE-NEXT: movaps %xmm2, %xmm3 +; X86-SSE-NEXT: andps %xmm1, %xmm3 +; X86-SSE-NEXT: maxss %xmm0, %xmm1 +; X86-SSE-NEXT: andnps %xmm1, %xmm2 +; X86-SSE-NEXT: orps %xmm3, %xmm2 +; X86-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 +; X86-SSE-NEXT: movss %xmm2, (%esp) +; X86-SSE-NEXT: flds (%esp) +; X86-SSE-NEXT: popl %eax +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canonicalize_fp32: +; X86-AVX: # %bb.0: # %start +; X86-AVX-NEXT: pushl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 8 +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmaxss %xmm0, %xmm1, %xmm2 +; X86-AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X86-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%esp) +; X86-AVX-NEXT: flds (%esp) +; X86-AVX-NEXT: popl %eax +; X86-AVX-NEXT: .cfi_def_cfa_offset 4 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canonicalize_fp32: ; SSE: # %bb.0: # %start ; SSE-NEXT: movaps %xmm0, %xmm2 @@ -261,6 +411,22 @@ define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { ; X87-NEXT: fstps (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: v_test_canonicalize_var_f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movss %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: v_test_canonicalize_var_f32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovss %xmm0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: v_test_canonicalize_var_f32: ; SSE: # %bb.0: ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -290,6 +456,24 @@ define void @v_test_canonicalize_x86_fp80(x86_fp80 addrspace(1)* %out) #1 { ; X87-NEXT: fstpt (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: v_test_canonicalize_x86_fp80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: fldt (%eax) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmulp %st, %st(1) +; X86-SSE-NEXT: fstpt (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: v_test_canonicalize_x86_fp80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: fldt (%eax) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmulp %st, %st(1) +; X86-AVX-NEXT: fstpt (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: v_test_canonicalize_x86_fp80: ; 
SSE: # %bb.0: ; SSE-NEXT: fldt (%rdi) @@ -320,6 +504,22 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { ; X87-NEXT: fstpl (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: v_test_canonicalize_var_f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movsd %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: v_test_canonicalize_var_f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovsd %xmm0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: v_test_canonicalize_var_f64: ; SSE: # %bb.0: ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero @@ -347,6 +547,20 @@ define void @canonicalize_undef(double addrspace(1)* %out) { ; X87-NEXT: movl $0, (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canonicalize_undef: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000 +; X86-SSE-NEXT: movl $0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canonicalize_undef: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: movl $2146959360, 4(%eax) # imm = 0x7FF80000 +; X86-AVX-NEXT: movl $0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canonicalize_undef: ; SSE: # %bb.0: ; SSE-NEXT: movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000 @@ -384,6 +598,16 @@ define <4 x float> @canon_fp32_varargsv4f32(<4 x float> %a) { ; X87-NEXT: fstps (%eax) ; X87-NEXT: retl $4 ; +; X86-SSE-LABEL: canon_fp32_varargsv4f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp32_varargsv4f32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp32_varargsv4f32: ; SSE: # %bb.0: ; SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -430,6 +654,18 @@ define <4 x double> @canon_fp64_varargsv4f64(<4 x double> %a) { ; X87-NEXT: fstpl (%eax) ; X87-NEXT: retl $4 ; +; X86-SSE-LABEL: canon_fp64_varargsv4f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0] +; X86-SSE-NEXT: mulpd %xmm2, %xmm0 +; X86-SSE-NEXT: mulpd %xmm2, %xmm1 +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp64_varargsv4f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp64_varargsv4f64: ; SSE: # %bb.0: ; SSE-NEXT: movapd {{.*#+}} xmm2 = [1.0E+0,1.0E+0] @@ -468,6 +704,26 @@ define <2 x x86_fp80> @canon_fp80_varargsv2fp80(<2 x x86_fp80> %a) { ; X87-NEXT: fxch %st(1) ; X87-NEXT: retl ; +; X86-SSE-LABEL: canon_fp80_varargsv2fp80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fldt {{[0-9]+}}(%esp) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmul %st, %st(1) +; X86-SSE-NEXT: fmulp %st, %st(2) +; X86-SSE-NEXT: fxch %st(1) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: canon_fp80_varargsv2fp80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fldt {{[0-9]+}}(%esp) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmul %st, %st(1) +; X86-AVX-NEXT: fmulp %st, %st(2) +; X86-AVX-NEXT: fxch %st(1) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: canon_fp80_varargsv2fp80: ; SSE: # %bb.0: ; SSE-NEXT: fldt {{[0-9]+}}(%rsp) @@ -512,6 +768,22 @@ define void 
@vec_canonicalize_var_v4f32(<4 x float> addrspace(1)* %out) #1 { ; X87-NEXT: fstps (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: vec_canonicalize_var_v4f32: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movaps (%eax), %xmm0 +; X86-SSE-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: movaps %xmm0, (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: vec_canonicalize_var_v4f32: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovaps (%eax), %xmm0 +; X86-AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vmovaps %xmm0, (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: vec_canonicalize_var_v4f32: ; SSE: # %bb.0: ; SSE-NEXT: movaps (%rdi), %xmm0 @@ -566,6 +838,26 @@ define void @vec_canonicalize_var_v4f64(<4 x double> addrspace(1)* %out) #1 { ; X87-NEXT: fstpl (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: vec_canonicalize_var_v4f64: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] +; X86-SSE-NEXT: movapd 16(%eax), %xmm1 +; X86-SSE-NEXT: mulpd %xmm0, %xmm1 +; X86-SSE-NEXT: mulpd (%eax), %xmm0 +; X86-SSE-NEXT: movapd %xmm0, (%eax) +; X86-SSE-NEXT: movapd %xmm1, 16(%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: vec_canonicalize_var_v4f64: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: vmovapd (%eax), %ymm0 +; X86-AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX-NEXT: vmovapd %ymm0, (%eax) +; X86-AVX-NEXT: vzeroupper +; X86-AVX-NEXT: retl +; ; SSE-LABEL: vec_canonicalize_var_v4f64: ; SSE: # %bb.0: ; SSE-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,1.0E+0] @@ -626,6 +918,46 @@ define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 { ; X87-NEXT: fstpt (%eax) ; X87-NEXT: retl ; +; X86-SSE-LABEL: vec_canonicalize_x86_fp80: +; X86-SSE: # %bb.0: +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SSE-NEXT: fldt 30(%eax) +; X86-SSE-NEXT: fldt 20(%eax) +; X86-SSE-NEXT: fldt 10(%eax) +; X86-SSE-NEXT: fldt (%eax) +; X86-SSE-NEXT: fld1 +; X86-SSE-NEXT: fmul %st, %st(1) +; X86-SSE-NEXT: fmul %st, %st(2) +; X86-SSE-NEXT: fmul %st, %st(3) +; X86-SSE-NEXT: fmulp %st, %st(4) +; X86-SSE-NEXT: fxch %st(3) +; X86-SSE-NEXT: fstpt 30(%eax) +; X86-SSE-NEXT: fxch %st(1) +; X86-SSE-NEXT: fstpt 20(%eax) +; X86-SSE-NEXT: fstpt 10(%eax) +; X86-SSE-NEXT: fstpt (%eax) +; X86-SSE-NEXT: retl +; +; X86-AVX-LABEL: vec_canonicalize_x86_fp80: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-AVX-NEXT: fldt 30(%eax) +; X86-AVX-NEXT: fldt 20(%eax) +; X86-AVX-NEXT: fldt 10(%eax) +; X86-AVX-NEXT: fldt (%eax) +; X86-AVX-NEXT: fld1 +; X86-AVX-NEXT: fmul %st, %st(1) +; X86-AVX-NEXT: fmul %st, %st(2) +; X86-AVX-NEXT: fmul %st, %st(3) +; X86-AVX-NEXT: fmulp %st, %st(4) +; X86-AVX-NEXT: fxch %st(3) +; X86-AVX-NEXT: fstpt 30(%eax) +; X86-AVX-NEXT: fxch %st(1) +; X86-AVX-NEXT: fstpt 20(%eax) +; X86-AVX-NEXT: fstpt 10(%eax) +; X86-AVX-NEXT: fstpt (%eax) +; X86-AVX-NEXT: retl +; ; SSE-LABEL: vec_canonicalize_x86_fp80: ; SSE: # %bb.0: ; SSE-NEXT: fldt 30(%rdi) @@ -668,5 +1000,3 @@ define void @vec_canonicalize_x86_fp80(<4 x x86_fp80> addrspace(1)* %out) #1 { store <4 x x86_fp80> %canonicalized, <4 x x86_fp80> addrspace(1)* %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; SSE2: {{.*}} From db79fb2a91df31a07f312f8e061936927ac5c506 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sun, 26 Jan 2025 15:40:55 -0800 Subject: [PATCH 151/432] [msan] Add handlers for AVX masked load/store intrinsics (#123857) This patch adds explicit support for AVX masked load/store intrinsics, largely by applying the intrinsics to the shadows (but subtly different to handleIntrinsicByApplyingToShadow()). We do not reuse the handleMaskedLoad/Store functions. The key challenge is that the LLVM masked intrinsics require a vector of booleans, while AVX masked intrinsics use the MSBs of a vector of integers. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad mentions that the x86 backend does not know how to efficiently convert from a vector of booleans back into the AVX mask format; therefore, they (and we) do not reduce AVX masked intrinsics into LLVM masked intrinsics. --- .../Instrumentation/MemorySanitizer.cpp | 154 ++++++++++++++++- .../MemorySanitizer/X86/avx-intrinsics-x86.ll | 160 ++++++++++-------- .../X86/avx2-intrinsics-x86.ll | 152 +++++++++-------- .../i386/avx-intrinsics-i386.ll | 160 ++++++++++-------- .../i386/avx2-intrinsics-i386.ll | 152 +++++++++-------- 5 files changed, 489 insertions(+), 289 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 56d3eb10d73e9..b6293af4ab477 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3046,7 +3046,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (maybeHandleSimpleNomemIntrinsic(I)) return true; - // FIXME: detect and handle SSE maskstore/maskload + // FIXME: detect and handle SSE maskstore/maskload? + // Some cases are now handled in handleAVXMasked{Load,Store}. return false; } @@ -3683,6 +3684,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // TODO: Store origin. } + // Intrinsic::masked_store + // + // Note: handleAVXMaskedStore handles AVX/AVX2 variants, though AVX512 masked + // stores are lowered to Intrinsic::masked_store. void handleMaskedStore(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *V = I.getArgOperand(0); @@ -3713,6 +3718,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { std::max(Alignment, kMinOriginAlignment)); } + // Intrinsic::masked_load + // + // Note: handleAVXMaskedLoad handles AVX/AVX2 variants, though AVX512 masked + // loads are lowered to Intrinsic::masked_load. void handleMaskedLoad(IntrinsicInst &I) { IRBuilder<> IRB(&I); Value *Ptr = I.getArgOperand(0); @@ -3754,6 +3763,125 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOrigin(&I, Origin); } + // e.g., void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>) + // dst mask src + // + // AVX512 masked stores are lowered to Intrinsic::masked_store and are handled + // by handleMaskedStore. + // + // This function handles AVX and AVX2 masked stores; these use the MSBs of a + // vector of integers, unlike the LLVM masked intrinsics, which require a + // vector of booleans. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad + // mentions that the x86 backend does not know how to efficiently convert + // from a vector of booleans back into the AVX mask format; therefore, they + // (and we) do not reduce AVX/AVX2 masked intrinsics into LLVM masked + // intrinsics.
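+  //
+  // Illustrative example (added commentary, not part of the original patch):
+  // given a <4 x i32> AVX mask with elements {-1, 1, 0x80000000, 0}, only
+  // lanes 0 and 2 are selected, because only those elements have their MSB
+  // set; the equivalent LLVM masked intrinsic would need the <4 x i1> mask
+  // <1, 0, 1, 0>, and it is this boolean form that the x86 backend cannot
+  // cheaply convert back into the AVX mask format.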
+ void handleAVXMaskedStore(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + Value *Dst = I.getArgOperand(0); + assert(Dst->getType()->isPointerTy() && "Destination is not a pointer!"); + + Value *Mask = I.getArgOperand(1); + assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!"); + + Value *Src = I.getArgOperand(2); + assert(isa<VectorType>(Src->getType()) && "Source is not a vector!"); + + const Align Alignment = Align(1); + + Value *SrcShadow = getShadow(Src); + + if (ClCheckAccessAddress) { + insertShadowCheck(Dst, &I); + insertShadowCheck(Mask, &I); + } + + Value *DstShadowPtr; + Value *DstOriginPtr; + std::tie(DstShadowPtr, DstOriginPtr) = getShadowOriginPtr( + Dst, IRB, SrcShadow->getType(), Alignment, /*isStore*/ true); + + SmallVector<Value *> ShadowArgs; + ShadowArgs.append(1, DstShadowPtr); + ShadowArgs.append(1, Mask); + // The intrinsic may require floating-point but shadows can be arbitrary + // bit patterns, of which some would be interpreted as "invalid" + // floating-point values (NaN etc.); we assume the intrinsic will happily + // copy them. + ShadowArgs.append(1, IRB.CreateBitCast(SrcShadow, Src->getType())); + + CallInst *CI = + IRB.CreateIntrinsic(IRB.getVoidTy(), I.getIntrinsicID(), ShadowArgs); + setShadow(&I, CI); + + if (!MS.TrackOrigins) + return; + + // Approximation only + auto &DL = F.getDataLayout(); + paintOrigin(IRB, getOrigin(Src), DstOriginPtr, + DL.getTypeStoreSize(SrcShadow->getType()), + std::max(Alignment, kMinOriginAlignment)); + } + + // e.g., <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) + // return src mask + // + // Masked-off values are replaced with 0, which conveniently also represents + // initialized memory. + // + // AVX512 masked loads are lowered to Intrinsic::masked_load and are handled + // by handleMaskedLoad. + // + // We do not combine this with handleMaskedLoad; see comment in + // handleAVXMaskedStore for the rationale. + // + // This is subtly different than handleIntrinsicByApplyingToShadow(I, 1) + // because we need to apply getShadowOriginPtr, not getShadow, to the first + // parameter. + void handleAVXMaskedLoad(IntrinsicInst &I) { + IRBuilder<> IRB(&I); + + Value *Src = I.getArgOperand(0); + assert(Src->getType()->isPointerTy() && "Source is not a pointer!"); + + Value *Mask = I.getArgOperand(1); + assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!"); + + const Align Alignment = Align(1); + + if (ClCheckAccessAddress) { + insertShadowCheck(Mask, &I); + } + + Type *SrcShadowTy = getShadowTy(Src); + Value *SrcShadowPtr, *SrcOriginPtr; + std::tie(SrcShadowPtr, SrcOriginPtr) = + getShadowOriginPtr(Src, IRB, SrcShadowTy, Alignment, /*isStore*/ false); + + SmallVector<Value *> ShadowArgs; + ShadowArgs.append(1, SrcShadowPtr); + ShadowArgs.append(1, Mask); + + CallInst *CI = + IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), ShadowArgs); + // The intrinsic may require floating-point but shadows can be arbitrary + // bit patterns, of which some would be interpreted as "invalid" + // floating-point values (NaN etc.); we assume the intrinsic will happily + // copy them. + setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I))); + + if (!MS.TrackOrigins) + return; + + // The "pass-through" value is always zero (initialized). To the extent + // that that results in initialized aligned 4-byte chunks, the origin value + // is ignored. It is therefore correct to simply copy the origin from src. + Value *PtrSrcOrigin = IRB.CreateLoad(MS.OriginTy, SrcOriginPtr); + setOrigin(&I, PtrSrcOrigin); + } + + // Instrument BMI / BMI2 intrinsics.
// All of these intrinsics are Z = I(X, Y) // where the types of all operands and the result match, and are either i32 or @@ -4466,6 +4594,30 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { break; } + case Intrinsic::x86_avx_maskstore_ps: + case Intrinsic::x86_avx_maskstore_pd: + case Intrinsic::x86_avx_maskstore_ps_256: + case Intrinsic::x86_avx_maskstore_pd_256: + case Intrinsic::x86_avx2_maskstore_d: + case Intrinsic::x86_avx2_maskstore_q: + case Intrinsic::x86_avx2_maskstore_d_256: + case Intrinsic::x86_avx2_maskstore_q_256: { + handleAVXMaskedStore(I); + break; + } + + case Intrinsic::x86_avx_maskload_ps: + case Intrinsic::x86_avx_maskload_pd: + case Intrinsic::x86_avx_maskload_ps_256: + case Intrinsic::x86_avx_maskload_pd_256: + case Intrinsic::x86_avx2_maskload_d: + case Intrinsic::x86_avx2_maskload_q: + case Intrinsic::x86_avx2_maskload_d_256: + case Intrinsic::x86_avx2_maskload_q_256: { + handleAVXMaskedLoad(I); + break; + } + case Intrinsic::fshl: case Intrinsic::fshr: handleFunnelShift(I); diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll index 7273e431a9c2a..43f51a810d0d2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll @@ -532,20 +532,22 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP4]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[TMP5]] to <4 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) +; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -580,20 +584,22 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP4]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -604,20 +610,22 @@ declare <4 
x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP4]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[TMP5]] to <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) +; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -628,23 +636,25 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP6]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], 
label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -655,23 +665,25 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP6]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -682,23 +694,25 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to 
ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP6]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -709,23 +723,25 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP6]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 
[[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll index e10062142c046..c68461dd367ee 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll @@ -995,20 +995,21 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP4]], <2 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1019,20 +1020,21 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP4]], <4 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> 
[[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> [[A1]]) +; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1043,20 +1045,21 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP4]], <4 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1067,20 +1070,21 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> 
@llvm.x86.avx2.maskload.d.256(ptr [[TMP4]], <8 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 8: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> [[A1]]) +; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1091,23 +1095,24 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP6]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1118,23 +1123,24 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( +; CHECK-NEXT: 
[[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP6]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1145,23 +1151,24 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP6]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br 
i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1172,23 +1179,24 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP6]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll index 68337d6d962db..a22ca6dd15da4 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll @@ -550,21 +550,23 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[TMP11]], <2 x i64> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <2 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0]], <2 x i64> [[MASK]]) +; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx.maskload.pd(ptr %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1] @@ -575,21 +577,23 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readonly define <4 x double> @test_x86_avx_maskload_pd_256(ptr %a0, <4 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP11]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x double> [[TMP6]] to <4 x i64> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) +; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 
8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -600,21 +604,23 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP11]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -625,21 +631,23 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP11]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[TMP6]] to <8 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void 
@__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -650,24 +658,26 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP7]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -678,24 +688,26 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP7]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -706,24 +718,26 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP7]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: 
[[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -734,24 +748,26 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP7]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) +; CHECK: 11: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll index 29e2931d2ca48..442f0c422645a 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll @@ -1048,21 +1048,22 @@ declare <8 x 
float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP10]], <2 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1073,21 +1074,22 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP10]], <4 x i64> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> 
[[A1]]) +; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1098,21 +1100,22 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP10]], <4 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1123,21 +1126,22 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 +; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[TMP10]], <8 x i32> [[A1:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> 
@llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> [[A1]]) +; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1148,24 +1152,25 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP7]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1176,24 +1181,25 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr 
@__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP7]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1204,24 +1210,25 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP7]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; 
CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1232,24 +1239,25 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP7]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) +; CHECK: 10: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) From 980e86f130eea02bd41b887f4ed896340fc90f6c Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Sun, 26 Jan 2025 16:04:30 -0800 Subject: [PATCH 152/432] [msan] Add avx512-intrinsics.ll and avx512-intrinsics-upgrade.ll test case (#123980) These are forked from the corresponding files in llvm/test/CodeGen/X86/. avx512-intrinsics.ll shows that many intrinsics are already heuristically handled by MSan, and can be used to track refinements to the intrinsic handling. avx512-intrinsics-upgrade.ll tests intrinsics that LLVM "auto-upgrades"; for example, @llvm.x86.avx512.mask.store is converted into @llvm.masked.store (which has the interesting side effect that MemorySanitizer can already handle it via its existing handleMaskedStore). 
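As a rough sketch of that upgrade path (illustrative only, not part of this patch: the value names here are invented, and the 64-byte alignment is an assumption based on the natural alignment of a 512-bit store), a legacy masked store such as

    ; legacy AVX-512 intrinsic as written in older IR (hypothetical example)
    call void @llvm.x86.avx512.mask.store.d.512(ptr %p, <16 x i32> %v, i16 %m)

is rewritten by the IR auto-upgrader into the generic form

    ; after auto-upgrade: the i16 mask becomes <16 x i1>; alignment assumed to be 64
    %mask = bitcast i16 %m to <16 x i1>
    call void @llvm.masked.store.v16i32.p0(<16 x i32> %v, ptr %p, i32 64, <16 x i1> %mask)

which MemorySanitizer then instruments through its existing @llvm.masked.store handling, with no AVX-512-specific code needed.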
---
 .../X86/avx512-intrinsics-upgrade.ll | 19969 ++++++++++++++++
 .../MemorySanitizer/X86/avx512-intrinsics.ll | 13714 +++++++++++
 2 files changed, 33683 insertions(+)
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
 create mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
new file mode 100644
index 0000000000000..edb618fdfb8fb
--- /dev/null
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
@@ -0,0 +1,19969 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s
+;
+; Forked from llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+
+declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
+
+define i16 @unpckbw_test(i16 %a0, i16 %a1) #0 {
+;
+; CHECK-LABEL: @unpckbw_test(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i1> [[TMP3]], <16 x i1> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i1> [[TMP4]], <16 x i1> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i1> [[TMP6]], <16 x i1> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[_MSPROP1]], <8 x i1> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> [[TMP7]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP2]] to i16
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i16 [[TMP11]]
+;
+ %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
+ ret i16 %res
+}
+
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_gpr_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP6:%.*]] = 
insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <16 x i32> [[_MSPROP6]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP7]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DOTSPLAT2]], [[X1:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[DOTSPLAT2]], <16 x i32> [[X1]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <16 x i32> [[_MSPROP8]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT3]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[DOTSPLAT4]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[DOTSPLAT4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP5]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[DOTSPLAT]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP18]], <16 x i32> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP10]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP19]], <16 x i32> [[_MSPROP_SELECT10]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP17]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 
0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} +declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) + + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_gpr_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <8 x i64> [[_MSPROP6]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP7]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[DOTSPLAT2]], [[X1:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[DOTSPLAT2]], <8 x i64> [[X1]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <8 x i64> [[_MSPROP8]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP9]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[DOTSPLAT4]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[DOTSPLAT4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <8 x i64>, <8 
x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP5]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[DOTSPLAT]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP18]], <8 x i64> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP10]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP19]], <8 x i64> [[_MSPROP_SELECT10]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP17]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} +declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) + + +declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly + +define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_x86_vbroadcast_ss_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +define <16 x float> @test_x86_mask_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_mask_vbroadcast_ss_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; 
CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[A1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_x86_maskz_vbroadcast_ss_ps_512(<4 x float> %a0, i16 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_maskz_vbroadcast_ss_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly + +define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_vbroadcast_sd_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +define <8 x double> @test_x86_mask_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_mask_vbroadcast_sd_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: 
[[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[A1:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_x86_maskz_vbroadcast_sd_pd_512(<2 x double> %a0, i8 %mask ) #0 { +; +; CHECK-LABEL: @test_x86_maskz_vbroadcast_sd_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pbroadcastd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call 
void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_512(<4 x i32> %x0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, 
align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pbroadcastq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_512(<2 x i64> %x0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 
x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP10]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_movsldup_512(<16 x float> %x0, <16 x float> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_movsldup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_movsldup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP13]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_maskz_movsldup_512(<16 x float> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_movsldup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP11]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_movshdup_512(<16 x float> %x0, <16 x float> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_movshdup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_movshdup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP13]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_maskz_movshdup_512(<16 x float> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_movshdup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP11]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_movddup_512(<8 x double> %x0, <8 x double> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_movddup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_movddup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP13]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_movddup_512(<8 x double> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_movddup_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP11]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2)
+ ret <8 x double> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_perm_df_512(<8 x double> %x0, <8 x double> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_perm_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_perm_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP13]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_perm_df_512(<8 x double> %x0, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_df_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP11]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3)
+ ret <8 x double> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_perm_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP2]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_perm_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_di_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> <i32 3, i32 0, i32 0, i32 0, i32 7, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls,
align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_store1( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr, <16 x float> %data, i16 %mask) + call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr2, <16 x float> %data, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.ps.512(ptr, <16 x float>, i16 ) + +define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_store2( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; 
CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr, <8 x double> %data, i8 %mask) + call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr2, <8 x double> %data, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.pd.512(ptr, <8 x double>, i8) + +define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_store_aligned_ps( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 64, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64 +; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 64 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr, <16 x float> %data, i16 %mask) + call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr2, <16 x float> %data, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.ps.512(ptr, <16 x float>, i16 ) + +define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_store_aligned_pd( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64 +; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 64 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr, <8 x double> %data, i8 %mask) + call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr2, <8 x double> %data, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.store.pd.512(ptr, <8 x double>, i8) + +define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load 
<8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 1, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2) + call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.q.512(ptr, <8 x i64>, i8) + +define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 1, <16 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1 +; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 1 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2) + call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1) + ret void +} + +declare void @llvm.x86.avx512.mask.storeu.d.512(ptr, <16 x i32>, i16) + +define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_store_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK: 11: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 12: +; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 64, <8 x i1> [[TMP6]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 +; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64 +; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 64 +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.store.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2) + call void 
@llvm.x86.avx512.mask.store.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.q.512(ptr, <8 x i64>, i8)
+
+define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_store_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 64, <16 x i1> [[TMP6]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
+; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
+; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64
+; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 64
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2)
+ call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1)
+ ret void
+}
+
+declare void @llvm.x86.avx512.mask.store.d.512(ptr, <16 x i32>, i16)
+
+define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_aligned_ps(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES4]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr, <16 x float>, i16)
+
+define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_ps(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 1, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 1, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES4]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
+ %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
+ %res4 = fadd <16 x float> %res2, %res1
+ ret <16 x float> %res4
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr, <16 x float>, i16)
+
+define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_aligned_pd(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES4]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr, <8 x double>, i8)
+
+define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_pd(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 1, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
+; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 1, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
+; CHECK: 24:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 25:
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES4]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
+ %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
+ %res4 = fadd <8 x double> %res2, %res1
+ ret <8 x double> %res4
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr, <8 x double>, i8)
+
+declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %data, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_d(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP10]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK: 16:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 17:
+; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR2]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[TMP6]])
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
+; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP19]] to i16
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP24]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
+; CHECK: 25:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 26:
+; CHECK-NEXT: [[TMP27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP27]], [[TMP18]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[RES4]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr2, <16 x i32> %res, i16 %mask)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask)
+ %res4 = add <16 x i32> %res2, %res1
+ ret <16 x i32> %res4
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %data, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_unaligned_q(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
+; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP14]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP10]] to i8
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
+; CHECK: 16:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 17:
+; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR2]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[TMP6]])
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
+; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP23]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP19]] to i8
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP24]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
+; CHECK: 25:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 26:
+; CHECK-NEXT: [[TMP27:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP27]], [[TMP18]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[RES4]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr2, <8 x i64> %res, i8 %mask)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask)
+ %res4 = add <8 x i64> %res2, %res1
+ ret <8 x i64> %res4
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_load_aligned_d(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
+; CHECK: 15:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 16:
+; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[TMP5]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 +; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] +; CHECK: 24: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 25: +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP26]], [[TMP17]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> %res, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask) + %res4 = add <16 x i32> %res2, %res1 + ret <16 x i32> %res4 +} + +declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr, <8 x i64>, i8) + +define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_load_aligned_q( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]]) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] +; CHECK: 15: +; CHECK-NEXT: 
call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[TMP5]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080 +; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr +; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]] +; CHECK: 24: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 25: +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]] +; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP26]], [[TMP17]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> %res, i8 %mask) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask) + %res4 = add <8 x i64> %res2, %res1 + ret <8 x i64> %res4 +} + +declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermil_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; 
CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermil_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 { +; +; 
CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_vpermil_ps_512(<16 x float> %x0, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: 
@test_int_x86_avx512_pshuf_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X2:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pshuf_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x 
i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_pcmpeq_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP11]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpeq_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: store i16 [[TMP19]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP20]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 
@test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) #0 { +; CHECK-LABEL: @test_pcmpeq_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: store i8 [[TMP10]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP11]] +; + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpeq_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: store i8 [[TMP19]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP20]] +; + %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_pcmpgt_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <16 x i32> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP13]] to i16 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP16]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1) + ret i16 %res +} + +define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpgt_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <16 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: store i16 [[TMP24]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP25]] +; + %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16) + +define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) #0 { +; 
CHECK-LABEL: @test_pcmpgt_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i1> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP13]] to i8 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i1> [[TMP14]] to i8 +; CHECK-NEXT: store i8 [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP16]] +; + %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1) + ret i8 %res +} + +define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_pcmpgt_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP2]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <8 x i64> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i64> [[A]], [[B]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP14]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i1> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 +; CHECK-NEXT: store i8 [[TMP24]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP25]] +; + %res = 
call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret i8 %res +} + +declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8) + +declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_unpckh_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP3]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP14]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_unpckh_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, 
+declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_unpckh_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_unpckl_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP14]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+ ret <8 x double> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_unpckl_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+ ret <16 x float> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpcklqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
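+; The integer unpacks below follow the same scheme as the pd/ps tests; the
+; one visible difference in the instrumentation is that the xor/or shadow
+; computation for the masked select operates on the <8 x i64> (or <16 x i32>)
+; values directly, with no bitcasts from a floating-point type.
+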
+define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpcklqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_punpcklqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpckhqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhqd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpckhd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23, i32 10, i32 26, i32 11, i32 27, i32 14, i32 30, i32 15, i32 31>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_punpckld_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_punpckld_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 8, i32 24, i32 9, i32 25, i32 12, i32 28, i32 13, i32 29>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32>
@llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_pslli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], 
zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_pslli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = 
bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrai_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), 
align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 
7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrai_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, 
i32, <8 x i64>, i8) nounwind readnone + +declare void @llvm.x86.avx512.storent.q.512(ptr, <8 x i64>) + +define void@test_storent_q_512(<8 x i64> %data, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_storent_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64 +; CHECK-NEXT: store <8 x i64> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2:![0-9]+]] +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.storent.q.512(ptr %ptr, <8 x i64> %data) + ret void +} + +declare void @llvm.x86.avx512.storent.pd.512(ptr, <8 x double>) + +define void @test_storent_pd_512(<8 x double> %data, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_storent_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64 +; CHECK-NEXT: store <8 x double> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]] +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.storent.pd.512(ptr %ptr, <8 x double> %data) + ret void +} + +declare void @llvm.x86.avx512.storent.ps.512(ptr, <16 x float>) + +define void @test_storent_ps_512(<16 x float> %data, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_storent_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP7]], align 64 +; CHECK-NEXT: store <16 x float> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]] +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.storent.ps.512(ptr %ptr, <16 x float> %data) + ret void +} + +define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) #0 { 
+; CHECK-LABEL: @test_xor_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_xor_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_or_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[A]], [[B]] +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_or_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP5]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP1]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[A]], [[B]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_and_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[A]], [[B]] +; CHECK-NEXT: store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_and_epi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x 
i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[A]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP9]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_xor_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_xor_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_or_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP10]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_or_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP1]], [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]]
+; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP19]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_and_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_and_epi64(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[A]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP9]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP9]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
+; CHECK-LABEL: @test_mask_add_epi32_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmb(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmbk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi32_rmbkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
+; CHECK-LABEL: @test_mask_sub_epi32_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %b = load <16 x i32>, ptr %ptr_b
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmb(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP8]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmbk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi32_rmbkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP16]]
+;
+ %q = load i32, ptr %ptr_b
+ %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+ %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_mask_add_epi64_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP16]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmb(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %q = load i64, ptr %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmbk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %q = load i64, ptr %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_add_epi64_rmbkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
+; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP16]]
+;
+ %q = load i64, ptr %ptr_b
+ %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
+ %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
+ %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_mask_sub_epi64_rr(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rrk(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rm(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP8]]
+;
+ %b = load <8 x i64>, ptr %ptr_b
+ %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
+ ret < 8 x i64> %res
+}
+
+define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_sub_epi64_rmk(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
+; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]]
to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %b = load <8 x i64>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %b = load <8 x i64>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label 
[[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP8]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] +; 
CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sub_epi64_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +declare <8 x i64> 
@llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mask_mullo_epi32_rr_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rrk_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rrkz_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; 
CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rm_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmk_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = 
bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmkz_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %q = load i32, ptr %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmbk_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: 
[[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %q = load i32, ptr %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer + %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) + ret < 16 x i32> %res +} + +define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mullo_epi32_rmbkz_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer +; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %q = load i32, ptr %ptr_b + %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 + %b = shufflevector <16 x i32> 
%vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
+ ret < 16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+
+declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_f32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
+ ret <16 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_f64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP14]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_f64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP12]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
+ ret <8 x double> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_i32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i32x4(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 16, i32 17, i32 18, i32 19>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X3:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X3]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_i64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i64x2(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> <i32 4, i32 5, i32 2, i32 3, i32 10, i32 11, i32 8, i32 9>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X3:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X3]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
+ ret <8 x i64> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP3]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP14]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_maskz_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> <i32 0, i32 9, i32 3, i32 10, i32 5, i32 12, i32 6, i32 14>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP12]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
+ ret <8 x double> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_shuf_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP3]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> <i32 2, i32 1, i32 17, i32 16, i32 6, i32 5, i32 21, i32 20, i32 10, i32 9, i32 25, i32 24, i32 14, i32 13, i32 29, i32 28>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP14]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
+ ret <16 x float> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmaxs_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmaxs_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmaxu_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmaxu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = 
select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmins_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmins_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pminu_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: 
[[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pminu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: 
[[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 { +; +; CHECK-LABEL: @test_mm_mask_move_ss( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0 +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[__W:%.*]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP]], i32 [[_MSPROP1]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP18]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP]] +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP24]], i32 [[TMP19]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], float [[TMP17]], float [[TMP18]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP25]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP26]] +; +entry: + %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U) + ret <4 x float> %res +} + + +define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 { +; +; CHECK-LABEL: @test_mm_maskz_move_ss( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8 
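All of the min/max tests above check the same two MemorySanitizer shadow shapes. For the unmasked intrinsics (upgraded to @llvm.smax/@llvm.umax/@llvm.smin/@llvm.umin) the result shadow is simply the OR of the two operand shadows ([[_MSPROP]]). The masked variants additionally model the blend with the passthru operand as a shadow select, and the [[TMP9]]..[[TMP11]] xor/or chain builds the conservative fallback used for lanes whose mask bit is itself uninitialized: any bit that differs between the two arms, or is poisoned in either arm, stays poisoned. A minimal sketch of the initialized-mask path, with illustrative names rather than the TLS loads used by the real instrumentation:

define <16 x i32> @sketch_masked_smax_shadow(<16 x i32> %sa, <16 x i32> %sb, <16 x i32> %spass, i16 %mask) {
  ; Element-wise min/max: any poisoned operand bit poisons the result.
  %sop = or <16 x i32> %sa, %sb
  %m = bitcast i16 %mask to <16 x i1>
  ; Masked blend: mask-on lanes take the op shadow, mask-off lanes the
  ; passthru shadow. The _MSPROP_SELECT sequence above additionally
  ; handles a mask whose own shadow is nonzero; this sketch assumes a
  ; fully initialized mask.
  %sres = select <16 x i1> %m, <16 x i32> %sop, <16 x i32> %spass
  ret <16 x i32> %sres
}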
+define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_mask_move_ss(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
+; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[__W:%.*]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP]], i32 [[_MSPROP1]]
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP18]] to i32
+; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP24]], i32 [[TMP19]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], float [[TMP17]], float [[TMP18]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP25]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP26]]
+;
+entry:
+ %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
+ ret <4 x float> %res
+}
+
+
+define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_maskz_move_ss(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
+; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[_MSPROP]], i32 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP16]] to i32
+; CHECK-NEXT: [[TMP19:%.*]] = xor i32 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP21]], i32 [[TMP17]]
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], float [[TMP16]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP22]], i64 0
+; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP23]]
+;
+entry:
+ %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
+ ret <4 x float> %res
+}
+
+define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_mask_move_sd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
+; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[__W:%.*]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i64 [[_MSPROP]], i64 [[_MSPROP1]]
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast double [[TMP17]] to i64
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP18]] to i64
+; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP20]], [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP24]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], double [[TMP17]], double [[TMP18]]
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP25]], i64 0
+; CHECK-NEXT: store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP26]]
+;
+entry:
+ %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
+;
+; CHECK-LABEL: @test_mm_maskz_move_sd(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
+; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
+; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i64 [[_MSPROP]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast double [[TMP16]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP20]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP21]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], double [[TMP16]], double 0.000000e+00
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP22]], i64 0
+; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[TMP23]]
+;
+entry:
+ %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
+ ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
+
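The mask/maskz move_ss and move_sd tests above exercise the same select modeling for a single lane: only bit 0 of %__U participates (the and i8 ..., 1), the [[_MSPROP_ICMP]] chain computes the shadow of the icmp on a possibly uninitialized mask byte, and the blended lane-0 shadow is re-inserted into the shadow of %__A, which supplies the upper lanes. A minimal sketch with illustrative names, again assuming the mask byte itself is initialized:

define <4 x i32> @sketch_mask_move_ss_shadow(<4 x i32> %sa, <4 x i32> %sb, <4 x i32> %sw, i8 %u) {
  %bit0 = and i8 %u, 1                        ; only mask bit 0 is used
  %take_b = icmp ne i8 %bit0, 0
  %sb0 = extractelement <4 x i32> %sb, i64 0  ; lane-0 shadow of %__B
  %sw0 = extractelement <4 x i32> %sw, i64 0  ; lane-0 shadow of %__W
  %slane = select i1 %take_b, i32 %sb0, i32 %sw0
  ; Lanes 1..3 of the result come from %__A, so start from its shadow.
  %sres = insertelement <4 x i32> %sa, i32 %slane, i64 0
  ret <4 x i32> %sres
}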
+declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_512(<16 x i8> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_512(<16 x i8> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_512(<8 x i32> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i16> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i16> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_512(<16 x i16> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i16> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_512(<8 x i16> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
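From here the file moves from the zero-extending pmovzx tests to the sign-extending pmovsx group. In both, the shadow mirrors the value computation: the same shufflevector lane selection is applied to the shadow, with an all-ones (fully poisoned) vector in the unused second slot, and the widening uses zext for pmovzx (the new high bits of the value are constant zero, hence defined) and sext for pmovsx (the shadow of the sign bit must cover the widened bits). A minimal sketch for the <8 x i32> to <8 x i64> case; the identity lane mask is written out here for the sketch, and all names are illustrative:

define <8 x i64> @sketch_pmovzxd_q_shadow(<8 x i32> %sx) {
  ; Lane selection applied to the shadow; the splat(-1) second operand
  ; means any lane drawn from it would read as fully uninitialized.
  %lanes = shufflevector <8 x i32> %sx, <8 x i32> splat (i32 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; zext for pmovzx; the pmovsx tests below use sext instead.
  %sres = zext <8 x i32> %lanes to <8 x i64>
  ret <8 x i64> %sres
}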
+declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_512(<16 x i8> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP11]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
+ ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_512(<16 x i8> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i32> [[TMP2]] to <8 x i64>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP3]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP12]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
+ ret <8 x i64> %res
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_512(<8 x i32> %x0, i8 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxd_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i32> [[TMP3]] to <8 x i64>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
+ ret <8 x i64> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i16> [[TMP2]] to <16 x i32>
+; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP3]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i16> [[TMP4]] to <16 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP12]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
+ ret <16 x i32> %res
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_512(<16 x i16> %x0, i16 %x2) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i16> [[TMP3]] to <16 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]]
= select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_512(<8 x i16> %x0, i8 %x2) #0 { +; +; 
CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] 
to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512( +; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> 
[[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> 
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_prol_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; 
CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %1 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 3) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + %4 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 5) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_prol_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> 
[[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %1 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 3) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + %4 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer + %7 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 5) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: 
@test_int_x86_avx512_pror_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 
x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %1 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 3) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + %4 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 4) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer + %7 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 5) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_pror_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) +; 
CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %1 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 3) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + %4 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 4) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer + %7 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 5) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0:%.*]], i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer 
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 5) +; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 6) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 6) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP18]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP20]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP19]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[TMP15]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP16]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 6, <8 x i64> zeroinitializer, i8 %x3) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <16 x i32>@llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0:%.*]], i32 4) +; CHECK-NEXT: 
[[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 5) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 6) +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 6) +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[TMP15]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP16]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[_MSPROP_SELECT1]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 6, <16 x i32> zeroinitializer, i16 %x3) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16) + +define { 
<16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psra_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 5) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x 
i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psra_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 5) +; CHECK-NEXT: 
[[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psll_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: 
[[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 5) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8) + +define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_psll_qi_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 3) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0:%.*]], i32 3) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> 
[[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 4) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 5) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 5) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 +; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) + %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) + %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 + %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 + %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 + ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 +} + +define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psll_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psll_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) 
to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP18]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psll_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psll_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP18]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> 
zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrl_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> 
%a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP18]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 
%mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrl_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], 
[[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP18]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psra_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> 
[[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP19]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psra_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP18]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; 
CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psra_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psra_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: 
[[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP18]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psllv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 
x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call 
<8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psllv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; 
CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrav_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x 
i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> 
%a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrav_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> 
zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> 
@test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone + 
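+; A minimal sketch of the shadow propagation that the psllv/psrav/psrlv checks
+; above all encode; the %s_* names are illustrative only and are not matched by
+; FileCheck. MSan cannot shift shadow bits by a possibly-uninitialized amount,
+; so it shifts the value shadow by the concrete amounts and then fully poisons
+; every lane whose shift amount is itself poisoned:
+;
+;   %s_amt  = icmp ne <8 x i64> %shadow_a1, zeroinitializer    ; amount lane poisoned?
+;   %s_ones = sext <8 x i1> %s_amt to <8 x i64>                ; widen to all-ones lanes
+;   %s_shl  = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %shadow_a0, <8 x i64> %a1)
+;   %s_res  = or <8 x i64> %s_shl, %s_ones                     ; final result shadow
+;
+; The mask/maskz variants then merge %s_res with the pass-through shadow via the
+; _MSPROP_SELECT select pattern seen in the checks.
+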
+define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, ptr %ptr) #0 { +; +; CHECK-LABEL: @test_x86_avx512_psrlv_q_memop( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i64> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = sext <8 x i1> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP2]], <8 x i64> [[B]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[B]]) +; CHECK-NEXT: store <8 x i64> [[TMP11]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %b = load <8 x i64>, ptr %ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_cvt_dq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[CVT]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] 
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_cvt_udq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[CVT]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) + ret <8 x double> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) #0 { +; CHECK-LABEL: @test_x86_vcvtph2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) #0 { +; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> [[A1:%.*]], i16 [[MASK:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; 
CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrkz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
+
+define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) #0 {
+; CHECK-LABEL: @test_valign_q(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[PALIGNR]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+
+define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) #0 {
+;
+; CHECK-LABEL: @test_mask_valign_q(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[PALIGNR]], [[SRC:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[PALIGNR]], <8 x i64> [[SRC]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP11]]
+;
+ %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
+ ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
+
+define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_maskz_valign_d(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> [[A:%.*]], <16 x i32> <i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[PALIGNR]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[PALIGNR]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP10]]
+;
+ %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
+ ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0, <8 x 
i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> 
@llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + + +define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], 
!prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res +} + +; Test case to make sure we can print shuffle decode comments for constant pool loads. +define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[X2]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] +; CHECK: 18: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 19: +; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> ) +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP27]], <16 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[TMP20]], <16 x float> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP29]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], [[_MSPROP_SELECT1]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[TMP16]], [[TMP28]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> zeroinitializer, [[_MSPROP]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP32]], [[RES3]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES4]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) + %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> zeroinitializer, i16 %x3) + %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 -1) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + +define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mask_mul_epi32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], 
zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] +; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP28]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 
8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP27]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 
[[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat 
(i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; 
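; The shl/ashr pairs in these checks mirror the data-level lowering of
; pmuldq: AutoUpgrade rewrites the masked intrinsic into generic IR that
; multiplies the sign-extended low 32 bits of each 64-bit lane, and MSan
; then instruments each generic instruction, narrowing the operand shadows
; the same way (shl by 32, then ashr by 32). The interleaved
; "or ..., zeroinitializer" terms appear to be the shift instrumentation
; folding in the all-zero shadow of the constant shift amount. A reduced
; sketch of the propagation, with illustrative names not taken from the
; generated checks:
;   %lo_a   = shl  <8 x i64> %s_a, splat (i64 32)   ; keep low-half shadow
;   %sext_a = ashr <8 x i64> %lo_a, splat (i64 32)  ; sign-extend it
;   %lo_b   = shl  <8 x i64> %s_b, splat (i64 32)
;   %sext_b = ashr <8 x i64> %lo_b, splat (i64 32)
;   %s_mul  = or <8 x i64> %sext_a, %sext_b         ; poisoned if either is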
CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP32]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] 
= or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: 
[[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP34]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbk_buildvector( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 +; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 
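; For a build vector spelled as an insertelement chain, the shadow is built
; by the parallel [[_MSPROP1]]..[[_MSPROP7]] chain checked here: it starts
; from splat (i64 -1) (every lane fully uninitialized) and inserts
; [[_MSLD]], the shadow loaded for %q, into the same lane as each data
; insert, so a result lane is poisoned exactly when the scalar written
; into it was.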
+; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 +; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 +; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 +; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 +; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 +; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP8]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP34]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 + %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 + %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 + %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 + %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, 
i64 %q, i32 4 + %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 + %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 + %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> 
zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epi32_rmbkz_buildvector(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epi32_rmbkz_buildvector( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 +; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 +; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 +; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 +; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 +; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 +; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 +; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = 
bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP8]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 + %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 + %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 + %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 + %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, i64 %q, i32 4 + %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 + %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 + %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) + +define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mask_mul_epu32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; 
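; Starting with test_mask_mul_epu32_rr, the checks cover the unsigned
; pmuludq lowering: instead of a shl/ashr sign-extension pair, data and
; shadow are masked with "and ... splat (i64 4294967295)" (0xFFFFFFFF) to
; keep only the low 32 bits of each lane. The extra and/or terms around it
; appear to come from MSan's rule for "and" with a constant, under which a
; result bit is poisoned only if it is poisoned in one operand and not
; known zero in the other. A reduced sketch with illustrative names:
;   %s_lo_a = and <8 x i64> %s_a, splat (i64 4294967295)
;   %s_lo_b = and <8 x i64> %s_b, splat (i64 4294967295)
;   %s_mul  = or <8 x i64> %s_lo_a, %s_lo_b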
CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> 
[[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] +; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP28]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP27]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: 
@test_mask_mul_epu32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 
[[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to 
<8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP32]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> 
zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] +; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP34]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_mul_epu32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; 
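; The xor with 87960930222080 (0x500000000000) checked throughout these
; memory-operand tests is the application-to-shadow address mapping MSan
; uses on x86-64 Linux, and the [[_MSCMP]] branch to
; __msan_warning_noreturn ahead of each load fires when the pointer
; argument's own shadow is nonzero. A reduced sketch with illustrative
; names:
;   %app    = ptrtoint ptr %p to i64
;   %shadow = xor i64 %app, 87960930222080            ; 0x500000000000
;   %sptr   = inttoptr i64 %shadow to ptr
;   %s_val  = load i64, ptr %sptr, align 8            ; shadow of *%p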
CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP33]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) + +define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_vextractf32x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[A:%.*]], <16 x float> [[A]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP12]], <4 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[B]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP13]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) + +define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_vextracti64x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP]], <4 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i64> [[TMP4]], [[B:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP10]], <4 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP4]], <4 x i64> [[B]] +; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP11]] +; + %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask) + ret <4 x i64> %res +} + +declare <4 x i64> 
@llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) + +define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_vextracti32x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[A]], <4 x i32> <i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i32> [[TMP10]] +; + %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) + +define <4 x double> @test_vextractf64x4(<8 x double> %a) #0 { +; CHECK-LABEL: @test_vextractf64x4( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> [[A]], <4 x i32> <i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x double> [[TMP2]] +; + %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) + ret <4 x double> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) + +declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_insertf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> 
[[X0:%.*]], <16 x float> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP4]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_insertf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP14]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> [[X3]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP15]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: 
[[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP12]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + ret <16 x float> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_inserti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_inserti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x 
i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X3:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X3]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP13]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP4]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + ret <16 x i32> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_insertf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> 
[[X0:%.*]], <8 x double> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP4]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_insertf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP14]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> [[X3]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP15]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 
x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP12]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_inserti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_inserti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X3:%.*]] +; 
CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X3]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP13]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_movntdqa(ptr %a0) #0 { +; +; CHECK-LABEL: @test_x86_avx512_movntdqa( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[A0:%.*]], align 64, !nontemporal [[META2]] +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 +; CHECK-NEXT: store <8 x i64> [[_MSLD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.movntdqa(ptr %a0) + ret <8 x i64> %res +} + +declare <8 x i64> 
@llvm.x86.avx512.movntdqa(ptr) nounwind readonly + +define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_cmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP12]], [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i32> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[TMP15]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP24]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[TMP26]], [[TMP1]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i32> [[TMP30]], [[TMP2]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <16 x i32> [[TMP28]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <16 x i32> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i1> [[TMP36]] to i16 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP38]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], 
i16 [[TMP39]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3 +; CHECK-NEXT: [[TMP40:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[TMP41]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[TMP41]], splat (i32 -1) +; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i32> [[TMP43]], [[TMP40]] +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <16 x i32> [[TMP44]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP42]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP47:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <16 x i1> [[TMP46]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP47]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP48]], i32 4 +; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP50:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP51:%.*]] = and <16 x i32> [[TMP49]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = or <16 x i32> [[TMP49]], [[TMP1]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP54:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i32> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] +; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <16 x i32> [[TMP51]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <16 x i32> [[TMP52]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = xor <16 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP61:%.*]] = bitcast <16 x i1> [[TMP59]] to i16 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast <16 x i1> [[TMP60]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP61]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP62]], i32 5 +; CHECK-NEXT: [[TMP63:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP64:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i32> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i32> [[TMP63]], [[TMP1]] +; CHECK-NEXT: [[TMP67:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP68:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i32> [[TMP67]], [[TMP2]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <16 x i32> [[TMP65]], [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <16 x i32> [[TMP66]], [[TMP69]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i1> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP75:%.*]] = bitcast <16 x i1> [[TMP73]] to i16 +; CHECK-NEXT: [[TMP76:%.*]] = bitcast <16 x i1> [[TMP74]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP75]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP76]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7 +; CHECK-NEXT: store <8 x i16> 
[[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_cmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> 
[[TMP21]], [[TMP1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP25]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[TMP24]], [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP32:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP31]], [[TMP34]] +; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP32]], [[TMP34]] +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP41]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP42]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[TMP43]], [[TMP1]] +; CHECK-NEXT: [[TMP47:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[TMP47]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <16 x i32> [[TMP45]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <16 x i32> [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP56:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP57:%.*]] = and <16 x i1> [[TMP53]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP54]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP53]], [[TMP56]] +; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]] +; CHECK-NEXT: [[TMP62:%.*]] = and <16 x i1> [[TMP54]], [[TMP56]] +; CHECK-NEXT: [[TMP63:%.*]] = bitcast <16 x i1> [[TMP61]] to i16 +; CHECK-NEXT: [[TMP64:%.*]] = bitcast <16 x i1> [[TMP62]] to i16 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP63]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP64]], i32 2 +; CHECK-NEXT: [[TMP65:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP67:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i1> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = or <16 x i1> [[TMP70]], 
[[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP73:%.*]] = bitcast <16 x i1> [[TMP71]] to i16 +; CHECK-NEXT: [[TMP74:%.*]] = bitcast <16 x i1> [[TMP72]] to i16 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP73]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP74]], i32 3 +; CHECK-NEXT: [[TMP75:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP76:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[TMP76]], zeroinitializer +; CHECK-NEXT: [[TMP78:%.*]] = xor <16 x i32> [[TMP76]], splat (i32 -1) +; CHECK-NEXT: [[TMP79:%.*]] = and <16 x i32> [[TMP78]], [[TMP75]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <16 x i32> [[TMP79]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP77]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP82:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP83:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP84:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP81]], [[TMP82]] +; CHECK-NEXT: [[TMP86:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP83]] +; CHECK-NEXT: [[TMP87:%.*]] = or <16 x i1> [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[TMP88:%.*]] = or <16 x i1> [[TMP87]], [[TMP86]] +; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i1> [[TMP81]], [[TMP83]] +; CHECK-NEXT: [[TMP90:%.*]] = bitcast <16 x i1> [[TMP88]] to i16 +; CHECK-NEXT: [[TMP91:%.*]] = bitcast <16 x i1> [[TMP89]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP90]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP91]], i32 4 +; CHECK-NEXT: [[TMP92:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP93:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP94:%.*]] = and <16 x i32> [[TMP92]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = or <16 x i32> [[TMP92]], [[TMP1]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP97:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP98:%.*]] = and <16 x i32> [[TMP96]], [[TMP97]] +; CHECK-NEXT: [[TMP99:%.*]] = or <16 x i32> [[TMP96]], [[TMP2]] +; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <16 x i32> [[TMP94]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <16 x i32> [[TMP95]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = xor <16 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP104:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP105:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP106:%.*]] = and <16 x i1> [[TMP102]], [[TMP104]] +; CHECK-NEXT: [[TMP107:%.*]] = and <16 x i1> [[TMP103]], [[TMP104]] +; CHECK-NEXT: [[TMP108:%.*]] = and <16 x i1> [[TMP102]], [[TMP105]] +; CHECK-NEXT: [[TMP109:%.*]] = or <16 x i1> [[TMP106]], [[TMP107]] +; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i1> [[TMP109]], [[TMP108]] +; CHECK-NEXT: [[TMP111:%.*]] = and <16 x i1> [[TMP103]], [[TMP105]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast <16 x i1> [[TMP110]] to i16 +; CHECK-NEXT: [[TMP113:%.*]] = bitcast <16 x i1> [[TMP111]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP112]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP113]], i32 5 +; CHECK-NEXT: [[TMP114:%.*]] = xor <16 x i32> 
[[A0]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP115:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP116:%.*]] = and <16 x i32> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = or <16 x i32> [[TMP114]], [[TMP1]] +; CHECK-NEXT: [[TMP118:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) +; CHECK-NEXT: [[TMP119:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i32> [[TMP118]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = or <16 x i32> [[TMP118]], [[TMP2]] +; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <16 x i32> [[TMP116]], [[TMP121]] +; CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <16 x i32> [[TMP117]], [[TMP120]] +; CHECK-NEXT: [[TMP124:%.*]] = xor <16 x i1> [[TMP122]], [[TMP123]] +; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP127:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP128:%.*]] = and <16 x i1> [[TMP124]], [[TMP126]] +; CHECK-NEXT: [[TMP129:%.*]] = and <16 x i1> [[TMP125]], [[TMP126]] +; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> [[TMP124]], [[TMP127]] +; CHECK-NEXT: [[TMP131:%.*]] = or <16 x i1> [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[TMP132:%.*]] = or <16 x i1> [[TMP131]], [[TMP130]] +; CHECK-NEXT: [[TMP133:%.*]] = and <16 x i1> [[TMP125]], [[TMP127]] +; CHECK-NEXT: [[TMP134:%.*]] = bitcast <16 x i1> [[TMP132]] to i16 +; CHECK-NEXT: [[TMP135:%.*]] = bitcast <16 x i1> [[TMP133]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP134]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP135]], i32 6 +; CHECK-NEXT: [[TMP136:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP138:%.*]] = and <16 x i1> zeroinitializer, [[TMP136]] +; CHECK-NEXT: [[TMP139:%.*]] = and <16 x i1> splat (i1 true), [[TMP136]] +; CHECK-NEXT: [[TMP140:%.*]] = and <16 x i1> zeroinitializer, [[TMP137]] +; CHECK-NEXT: [[TMP141:%.*]] = or <16 x i1> [[TMP138]], [[TMP139]] +; CHECK-NEXT: [[TMP142:%.*]] = or <16 x i1> [[TMP141]], [[TMP140]] +; CHECK-NEXT: [[TMP143:%.*]] = and <16 x i1> splat (i1 true), [[TMP137]] +; CHECK-NEXT: [[TMP144:%.*]] = bitcast <16 x i1> [[TMP142]] to i16 +; CHECK-NEXT: [[TMP145:%.*]] = bitcast <16 x i1> [[TMP143]] to i16 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP144]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP145]], i32 7 +; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 
5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone + +define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_ucmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[A0]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i32> [[A1]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <16 x i32> [[TMP13]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i1> [[TMP20]] to i16 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP22]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A0]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[A1]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <16 x i32> [[TMP25]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <16 x i32> [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = xor <16 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x i1> [[TMP32]] to i16 +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i1> [[TMP33]] to i16 +; CHECK-NEXT: 
[[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP34]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP35]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i32> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = xor <16 x i32> [[TMP37]], splat (i32 -1) +; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i32> [[TMP39]], [[TMP36]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <16 x i32> [[TMP40]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP38]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <16 x i1> [[TMP42]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP43]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP44]], i32 4 +; CHECK-NEXT: [[TMP45:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP46:%.*]] = and <16 x i32> [[A0]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[A1]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <16 x i32> [[TMP46]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <16 x i32> [[TMP47]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast <16 x i1> [[TMP53]] to i16 +; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i1> [[TMP54]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP55]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP56]], i32 5 +; CHECK-NEXT: [[TMP57:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i32> [[A0]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP60:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP61:%.*]] = and <16 x i32> [[A1]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <16 x i32> [[TMP58]], [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <16 x i32> [[TMP59]], [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = xor <16 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP67:%.*]] = bitcast <16 x i1> [[TMP65]] to i16 +; CHECK-NEXT: [[TMP68:%.*]] = bitcast <16 x i1> [[TMP66]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP67]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP68]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7 +; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) + %vec0 = 
insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_ucmp_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i32> [[A0]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <16 x i32> 
[[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <16 x i1> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = and <16 x i1> [[TMP29]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = or <16 x i1> [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP36]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP39]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP40]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP42:%.*]] = and <16 x i32> [[A0]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[A1]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <16 x i32> [[TMP42]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <16 x i32> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i1> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP51:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP52:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP53:%.*]] = and <16 x i1> [[TMP49]], [[TMP51]] +; CHECK-NEXT: [[TMP54:%.*]] = and <16 x i1> [[TMP50]], [[TMP51]] +; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i1> [[TMP49]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i1> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP57:%.*]] = or <16 x i1> [[TMP56]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP59:%.*]] = bitcast <16 x i1> [[TMP57]] to i16 +; CHECK-NEXT: [[TMP60:%.*]] = bitcast <16 x i1> [[TMP58]] to i16 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP59]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP60]], i32 2 +; CHECK-NEXT: [[TMP61:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP63:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = or <16 x i1> [[TMP66]], [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i1> [[TMP67]] to i16 +; CHECK-NEXT: [[TMP70:%.*]] = bitcast <16 x i1> [[TMP68]] to i16 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP69]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP70]], i32 3 +; CHECK-NEXT: [[TMP71:%.*]] = xor <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP72:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; 
CHECK-NEXT: [[TMP73:%.*]] = icmp ne <16 x i32> [[TMP72]], zeroinitializer +; CHECK-NEXT: [[TMP74:%.*]] = xor <16 x i32> [[TMP72]], splat (i32 -1) +; CHECK-NEXT: [[TMP75:%.*]] = and <16 x i32> [[TMP74]], [[TMP71]] +; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <16 x i32> [[TMP75]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP73]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP78:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP79:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP80:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP78]] +; CHECK-NEXT: [[TMP81:%.*]] = and <16 x i1> [[TMP77]], [[TMP78]] +; CHECK-NEXT: [[TMP82:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP79]] +; CHECK-NEXT: [[TMP83:%.*]] = or <16 x i1> [[TMP80]], [[TMP81]] +; CHECK-NEXT: [[TMP84:%.*]] = or <16 x i1> [[TMP83]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP77]], [[TMP79]] +; CHECK-NEXT: [[TMP86:%.*]] = bitcast <16 x i1> [[TMP84]] to i16 +; CHECK-NEXT: [[TMP87:%.*]] = bitcast <16 x i1> [[TMP85]] to i16 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP86]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP87]], i32 4 +; CHECK-NEXT: [[TMP88:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i32> [[A0]], [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP91:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP92:%.*]] = and <16 x i32> [[A1]], [[TMP91]] +; CHECK-NEXT: [[TMP93:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <16 x i32> [[TMP89]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <16 x i32> [[TMP90]], [[TMP92]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i1> [[TMP94]], [[TMP95]] +; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP98:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP99:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP100:%.*]] = and <16 x i1> [[TMP96]], [[TMP98]] +; CHECK-NEXT: [[TMP101:%.*]] = and <16 x i1> [[TMP97]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = and <16 x i1> [[TMP96]], [[TMP99]] +; CHECK-NEXT: [[TMP103:%.*]] = or <16 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP104:%.*]] = or <16 x i1> [[TMP103]], [[TMP102]] +; CHECK-NEXT: [[TMP105:%.*]] = and <16 x i1> [[TMP97]], [[TMP99]] +; CHECK-NEXT: [[TMP106:%.*]] = bitcast <16 x i1> [[TMP104]] to i16 +; CHECK-NEXT: [[TMP107:%.*]] = bitcast <16 x i1> [[TMP105]] to i16 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP106]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP107]], i32 5 +; CHECK-NEXT: [[TMP108:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) +; CHECK-NEXT: [[TMP109:%.*]] = and <16 x i32> [[A0]], [[TMP108]] +; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i32> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP111:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) +; CHECK-NEXT: [[TMP112:%.*]] = and <16 x i32> [[A1]], [[TMP111]] +; CHECK-NEXT: [[TMP113:%.*]] = or <16 x i32> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <16 x i32> [[TMP109]], [[TMP113]] +; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <16 x i32> [[TMP110]], [[TMP112]] +; CHECK-NEXT: [[TMP116:%.*]] = xor <16 x i1> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> 
+; CHECK-NEXT: [[TMP119:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i1> [[TMP116]], [[TMP118]] +; CHECK-NEXT: [[TMP121:%.*]] = and <16 x i1> [[TMP117]], [[TMP118]] +; CHECK-NEXT: [[TMP122:%.*]] = and <16 x i1> [[TMP116]], [[TMP119]] +; CHECK-NEXT: [[TMP123:%.*]] = or <16 x i1> [[TMP120]], [[TMP121]] +; CHECK-NEXT: [[TMP124:%.*]] = or <16 x i1> [[TMP123]], [[TMP122]] +; CHECK-NEXT: [[TMP125:%.*]] = and <16 x i1> [[TMP117]], [[TMP119]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast <16 x i1> [[TMP124]] to i16 +; CHECK-NEXT: [[TMP127:%.*]] = bitcast <16 x i1> [[TMP125]] to i16 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP126]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP127]], i32 6 +; CHECK-NEXT: [[TMP128:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP129:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> zeroinitializer, [[TMP128]] +; CHECK-NEXT: [[TMP131:%.*]] = and <16 x i1> splat (i1 true), [[TMP128]] +; CHECK-NEXT: [[TMP132:%.*]] = and <16 x i1> zeroinitializer, [[TMP129]] +; CHECK-NEXT: [[TMP133:%.*]] = or <16 x i1> [[TMP130]], [[TMP131]] +; CHECK-NEXT: [[TMP134:%.*]] = or <16 x i1> [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[TMP135:%.*]] = and <16 x i1> splat (i1 true), [[TMP129]] +; CHECK-NEXT: [[TMP136:%.*]] = bitcast <16 x i1> [[TMP134]] to i16 +; CHECK-NEXT: [[TMP137:%.*]] = bitcast <16 x i1> [[TMP135]] to i16 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP136]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP137]], i32 7 +; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i16> [[VEC7]] +; + %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask) + %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 + %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask) + %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 + %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask) + %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 + %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask) + %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 + %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask) + %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 + %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask) + %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 + %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask) + %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 + %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask) + %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 + ret <8 x i16> %vec7 +} + +declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone + +define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_cmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP1]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP16]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[TMP15]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8 +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP24]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP25]], i32 1 +; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[TMP26]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP26]], [[TMP1]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP30]], [[TMP2]] +; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <8 x i64> [[TMP28]], [[TMP33]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <8 x i64> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i1> [[TMP36]] to i8 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP38]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP39]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3 +; CHECK-NEXT: [[TMP40:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP41:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[TMP41]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x 
i64> [[TMP41]], splat (i64 -1) +; CHECK-NEXT: [[TMP44:%.*]] = and <8 x i64> [[TMP43]], [[TMP40]] +; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <8 x i64> [[TMP44]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP42]], [[TMP45]] +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP47:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <8 x i1> [[TMP46]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP47]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP48]], i32 4 +; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP50:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP51:%.*]] = and <8 x i64> [[TMP49]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = or <8 x i64> [[TMP49]], [[TMP1]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP54:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i64> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i64> [[TMP53]], [[TMP2]] +; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <8 x i64> [[TMP51]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <8 x i64> [[TMP52]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = xor <8 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP61:%.*]] = bitcast <8 x i1> [[TMP59]] to i8 +; CHECK-NEXT: [[TMP62:%.*]] = bitcast <8 x i1> [[TMP60]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP61]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP62]], i32 5 +; CHECK-NEXT: [[TMP63:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP64:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i64> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i64> [[TMP63]], [[TMP1]] +; CHECK-NEXT: [[TMP67:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP68:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i64> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i64> [[TMP67]], [[TMP2]] +; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <8 x i64> [[TMP65]], [[TMP70]] +; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <8 x i64> [[TMP66]], [[TMP69]] +; CHECK-NEXT: [[TMP73:%.*]] = xor <8 x i1> [[TMP71]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP75:%.*]] = bitcast <8 x i1> [[TMP73]] to i8 +; CHECK-NEXT: [[TMP76:%.*]] = bitcast <8 x i1> [[TMP74]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP75]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP76]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 
x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_cmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP25]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = or <8 x i64> [[TMP25]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[TMP24]], [[TMP27]] +; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: 
[[TMP32:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i1> [[TMP31]], [[TMP34]] +; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP35]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = or <8 x i1> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i1> [[TMP32]], [[TMP34]] +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i1> [[TMP40]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP41]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP42]], i32 1 +; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[TMP43]], [[TMP1]] +; CHECK-NEXT: [[TMP47:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[TMP47]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <8 x i64> [[TMP45]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <8 x i64> [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP56:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP57:%.*]] = and <8 x i1> [[TMP53]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP54]], [[TMP55]] +; CHECK-NEXT: [[TMP59:%.*]] = and <8 x i1> [[TMP53]], [[TMP56]] +; CHECK-NEXT: [[TMP60:%.*]] = or <8 x i1> [[TMP57]], [[TMP58]] +; CHECK-NEXT: [[TMP61:%.*]] = or <8 x i1> [[TMP60]], [[TMP59]] +; CHECK-NEXT: [[TMP62:%.*]] = and <8 x i1> [[TMP54]], [[TMP56]] +; CHECK-NEXT: [[TMP63:%.*]] = bitcast <8 x i1> [[TMP61]] to i8 +; CHECK-NEXT: [[TMP64:%.*]] = bitcast <8 x i1> [[TMP62]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP63]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP64]], i32 2 +; CHECK-NEXT: [[TMP65:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP66:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP67:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]] +; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i1> [[TMP67]], [[TMP68]] +; CHECK-NEXT: [[TMP71:%.*]] = or <8 x i1> [[TMP70]], [[TMP69]] +; CHECK-NEXT: [[TMP72:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]] +; CHECK-NEXT: [[TMP73:%.*]] = bitcast <8 x i1> [[TMP71]] to i8 +; CHECK-NEXT: [[TMP74:%.*]] = bitcast <8 x i1> [[TMP72]] to i8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP73]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP74]], i32 3 +; CHECK-NEXT: [[TMP75:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP76:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[TMP76]], 
zeroinitializer +; CHECK-NEXT: [[TMP78:%.*]] = xor <8 x i64> [[TMP76]], splat (i64 -1) +; CHECK-NEXT: [[TMP79:%.*]] = and <8 x i64> [[TMP78]], [[TMP75]] +; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <8 x i64> [[TMP79]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP77]], [[TMP80]] +; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP82:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP83:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP84:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP81]], [[TMP82]] +; CHECK-NEXT: [[TMP86:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP83]] +; CHECK-NEXT: [[TMP87:%.*]] = or <8 x i1> [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[TMP88:%.*]] = or <8 x i1> [[TMP87]], [[TMP86]] +; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i1> [[TMP81]], [[TMP83]] +; CHECK-NEXT: [[TMP90:%.*]] = bitcast <8 x i1> [[TMP88]] to i8 +; CHECK-NEXT: [[TMP91:%.*]] = bitcast <8 x i1> [[TMP89]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP90]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP91]], i32 4 +; CHECK-NEXT: [[TMP92:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP93:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP94:%.*]] = and <8 x i64> [[TMP92]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = or <8 x i64> [[TMP92]], [[TMP1]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP97:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP98:%.*]] = and <8 x i64> [[TMP96]], [[TMP97]] +; CHECK-NEXT: [[TMP99:%.*]] = or <8 x i64> [[TMP96]], [[TMP2]] +; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <8 x i64> [[TMP94]], [[TMP99]] +; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <8 x i64> [[TMP95]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = xor <8 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP104:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP105:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP106:%.*]] = and <8 x i1> [[TMP102]], [[TMP104]] +; CHECK-NEXT: [[TMP107:%.*]] = and <8 x i1> [[TMP103]], [[TMP104]] +; CHECK-NEXT: [[TMP108:%.*]] = and <8 x i1> [[TMP102]], [[TMP105]] +; CHECK-NEXT: [[TMP109:%.*]] = or <8 x i1> [[TMP106]], [[TMP107]] +; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i1> [[TMP109]], [[TMP108]] +; CHECK-NEXT: [[TMP111:%.*]] = and <8 x i1> [[TMP103]], [[TMP105]] +; CHECK-NEXT: [[TMP112:%.*]] = bitcast <8 x i1> [[TMP110]] to i8 +; CHECK-NEXT: [[TMP113:%.*]] = bitcast <8 x i1> [[TMP111]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP112]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP113]], i32 5 +; CHECK-NEXT: [[TMP114:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP115:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP116:%.*]] = and <8 x i64> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = or <8 x i64> [[TMP114]], [[TMP1]] +; CHECK-NEXT: [[TMP118:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808) +; CHECK-NEXT: [[TMP119:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i64> [[TMP118]], [[TMP119]] +; CHECK-NEXT: [[TMP121:%.*]] = or <8 x i64> [[TMP118]], [[TMP2]] +; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <8 x i64> [[TMP116]], [[TMP121]] +; 
CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <8 x i64> [[TMP117]], [[TMP120]] +; CHECK-NEXT: [[TMP124:%.*]] = xor <8 x i1> [[TMP122]], [[TMP123]] +; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP127:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP128:%.*]] = and <8 x i1> [[TMP124]], [[TMP126]] +; CHECK-NEXT: [[TMP129:%.*]] = and <8 x i1> [[TMP125]], [[TMP126]] +; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> [[TMP124]], [[TMP127]] +; CHECK-NEXT: [[TMP131:%.*]] = or <8 x i1> [[TMP128]], [[TMP129]] +; CHECK-NEXT: [[TMP132:%.*]] = or <8 x i1> [[TMP131]], [[TMP130]] +; CHECK-NEXT: [[TMP133:%.*]] = and <8 x i1> [[TMP125]], [[TMP127]] +; CHECK-NEXT: [[TMP134:%.*]] = bitcast <8 x i1> [[TMP132]] to i8 +; CHECK-NEXT: [[TMP135:%.*]] = bitcast <8 x i1> [[TMP133]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP134]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP135]], i32 6 +; CHECK-NEXT: [[TMP136:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP137:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP138:%.*]] = and <8 x i1> zeroinitializer, [[TMP136]] +; CHECK-NEXT: [[TMP139:%.*]] = and <8 x i1> splat (i1 true), [[TMP136]] +; CHECK-NEXT: [[TMP140:%.*]] = and <8 x i1> zeroinitializer, [[TMP137]] +; CHECK-NEXT: [[TMP141:%.*]] = or <8 x i1> [[TMP138]], [[TMP139]] +; CHECK-NEXT: [[TMP142:%.*]] = or <8 x i1> [[TMP141]], [[TMP140]] +; CHECK-NEXT: [[TMP143:%.*]] = and <8 x i1> splat (i1 true), [[TMP137]] +; CHECK-NEXT: [[TMP144:%.*]] = bitcast <8 x i1> [[TMP142]] to i8 +; CHECK-NEXT: [[TMP145:%.*]] = bitcast <8 x i1> [[TMP143]] to i8 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP144]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP145]], i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_ucmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1) +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[A0]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[A1]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <8 x i64> [[TMP13]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i1> [[TMP20]] to i8 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP21]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP22]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A0]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[A1]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <8 x i64> [[TMP25]], [[TMP29]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <8 x i64> [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP32:%.*]] = xor <8 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i1> [[TMP32]] to i8 +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <8 x i1> [[TMP33]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP34]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP35]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <8 x i64> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = xor <8 x i64> [[TMP37]], splat (i64 -1) +; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i64> [[TMP39]], [[TMP36]] +; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <8 x i64> [[TMP40]], zeroinitializer +; CHECK-NEXT: 
[[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP38]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast <8 x i1> [[TMP42]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP43]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP44]], i32 4 +; CHECK-NEXT: [[TMP45:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP46:%.*]] = and <8 x i64> [[A0]], [[TMP45]] +; CHECK-NEXT: [[TMP47:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[A1]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <8 x i64> [[TMP46]], [[TMP50]] +; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <8 x i64> [[TMP47]], [[TMP49]] +; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]] +; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP55:%.*]] = bitcast <8 x i1> [[TMP53]] to i8 +; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i1> [[TMP54]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP55]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP56]], i32 5 +; CHECK-NEXT: [[TMP57:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i64> [[A0]], [[TMP57]] +; CHECK-NEXT: [[TMP59:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP60:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP61:%.*]] = and <8 x i64> [[A1]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <8 x i64> [[TMP58]], [[TMP62]] +; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <8 x i64> [[TMP59]], [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = xor <8 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP67:%.*]] = bitcast <8 x i1> [[TMP65]] to i8 +; CHECK-NEXT: [[TMP68:%.*]] = bitcast <8 x i1> [[TMP66]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP67]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP68]], i32 6 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 
@llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_ucmp_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1) +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0 +; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[A0]], [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A1]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <8 x i64> [[TMP22]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP25]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = and <8 x i1> [[TMP29]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = or <8 x i1> [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP36]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = and <8 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 
x i8> [[_MSPROP]], i8 [[TMP39]], i32 1 +; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP40]], i32 1 +; CHECK-NEXT: [[TMP41:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP42:%.*]] = and <8 x i64> [[A0]], [[TMP41]] +; CHECK-NEXT: [[TMP43:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[A1]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <8 x i64> [[TMP42]], [[TMP46]] +; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <8 x i64> [[TMP43]], [[TMP45]] +; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP51:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP52:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP51]] +; CHECK-NEXT: [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP51]] +; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i1> [[TMP49]], [[TMP52]] +; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i1> [[TMP53]], [[TMP54]] +; CHECK-NEXT: [[TMP57:%.*]] = or <8 x i1> [[TMP56]], [[TMP55]] +; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP50]], [[TMP52]] +; CHECK-NEXT: [[TMP59:%.*]] = bitcast <8 x i1> [[TMP57]] to i8 +; CHECK-NEXT: [[TMP60:%.*]] = bitcast <8 x i1> [[TMP58]] to i8 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP59]], i32 2 +; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP60]], i32 2 +; CHECK-NEXT: [[TMP61:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP62:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP63:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP64:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]] +; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i1> [[TMP63]], [[TMP64]] +; CHECK-NEXT: [[TMP67:%.*]] = or <8 x i1> [[TMP66]], [[TMP65]] +; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]] +; CHECK-NEXT: [[TMP69:%.*]] = bitcast <8 x i1> [[TMP67]] to i8 +; CHECK-NEXT: [[TMP70:%.*]] = bitcast <8 x i1> [[TMP68]] to i8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP69]], i32 3 +; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP70]], i32 3 +; CHECK-NEXT: [[TMP71:%.*]] = xor <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP72:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP73:%.*]] = icmp ne <8 x i64> [[TMP72]], zeroinitializer +; CHECK-NEXT: [[TMP74:%.*]] = xor <8 x i64> [[TMP72]], splat (i64 -1) +; CHECK-NEXT: [[TMP75:%.*]] = and <8 x i64> [[TMP74]], [[TMP71]] +; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <8 x i64> [[TMP75]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP73]], [[TMP76]] +; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP78:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP79:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP80:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP78]] +; CHECK-NEXT: [[TMP81:%.*]] = and <8 x i1> [[TMP77]], [[TMP78]] +; CHECK-NEXT: [[TMP82:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP79]] +; CHECK-NEXT: [[TMP83:%.*]] = or <8 x i1> [[TMP80]], [[TMP81]] +; CHECK-NEXT: [[TMP84:%.*]] = or <8 x i1> [[TMP83]], [[TMP82]] +; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP77]], [[TMP79]] +; CHECK-NEXT: [[TMP86:%.*]] = 
bitcast <8 x i1> [[TMP84]] to i8 +; CHECK-NEXT: [[TMP87:%.*]] = bitcast <8 x i1> [[TMP85]] to i8 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP86]], i32 4 +; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP87]], i32 4 +; CHECK-NEXT: [[TMP88:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i64> [[A0]], [[TMP88]] +; CHECK-NEXT: [[TMP90:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP91:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP92:%.*]] = and <8 x i64> [[A1]], [[TMP91]] +; CHECK-NEXT: [[TMP93:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <8 x i64> [[TMP89]], [[TMP93]] +; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <8 x i64> [[TMP90]], [[TMP92]] +; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i1> [[TMP94]], [[TMP95]] +; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP98:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP99:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP100:%.*]] = and <8 x i1> [[TMP96]], [[TMP98]] +; CHECK-NEXT: [[TMP101:%.*]] = and <8 x i1> [[TMP97]], [[TMP98]] +; CHECK-NEXT: [[TMP102:%.*]] = and <8 x i1> [[TMP96]], [[TMP99]] +; CHECK-NEXT: [[TMP103:%.*]] = or <8 x i1> [[TMP100]], [[TMP101]] +; CHECK-NEXT: [[TMP104:%.*]] = or <8 x i1> [[TMP103]], [[TMP102]] +; CHECK-NEXT: [[TMP105:%.*]] = and <8 x i1> [[TMP97]], [[TMP99]] +; CHECK-NEXT: [[TMP106:%.*]] = bitcast <8 x i1> [[TMP104]] to i8 +; CHECK-NEXT: [[TMP107:%.*]] = bitcast <8 x i1> [[TMP105]] to i8 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP106]], i32 5 +; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP107]], i32 5 +; CHECK-NEXT: [[TMP108:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1) +; CHECK-NEXT: [[TMP109:%.*]] = and <8 x i64> [[A0]], [[TMP108]] +; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i64> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP111:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1) +; CHECK-NEXT: [[TMP112:%.*]] = and <8 x i64> [[A1]], [[TMP111]] +; CHECK-NEXT: [[TMP113:%.*]] = or <8 x i64> [[A1]], [[TMP2]] +; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <8 x i64> [[TMP109]], [[TMP113]] +; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <8 x i64> [[TMP110]], [[TMP112]] +; CHECK-NEXT: [[TMP116:%.*]] = xor <8 x i1> [[TMP114]], [[TMP115]] +; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP118:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP119:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i1> [[TMP116]], [[TMP118]] +; CHECK-NEXT: [[TMP121:%.*]] = and <8 x i1> [[TMP117]], [[TMP118]] +; CHECK-NEXT: [[TMP122:%.*]] = and <8 x i1> [[TMP116]], [[TMP119]] +; CHECK-NEXT: [[TMP123:%.*]] = or <8 x i1> [[TMP120]], [[TMP121]] +; CHECK-NEXT: [[TMP124:%.*]] = or <8 x i1> [[TMP123]], [[TMP122]] +; CHECK-NEXT: [[TMP125:%.*]] = and <8 x i1> [[TMP117]], [[TMP119]] +; CHECK-NEXT: [[TMP126:%.*]] = bitcast <8 x i1> [[TMP124]] to i8 +; CHECK-NEXT: [[TMP127:%.*]] = bitcast <8 x i1> [[TMP125]] to i8 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP126]], i32 6 +; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP127]], i32 6 +; CHECK-NEXT: [[TMP128:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP129:%.*]] = bitcast i8 [[MASK]] to <8 x i1> +; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> zeroinitializer, [[TMP128]] +; CHECK-NEXT: [[TMP131:%.*]] = and <8 x i1> splat (i1 true), 
[[TMP128]] +; CHECK-NEXT: [[TMP132:%.*]] = and <8 x i1> zeroinitializer, [[TMP129]] +; CHECK-NEXT: [[TMP133:%.*]] = or <8 x i1> [[TMP130]], [[TMP131]] +; CHECK-NEXT: [[TMP134:%.*]] = or <8 x i1> [[TMP133]], [[TMP132]] +; CHECK-NEXT: [[TMP135:%.*]] = and <8 x i1> splat (i1 true), [[TMP129]] +; CHECK-NEXT: [[TMP136:%.*]] = bitcast <8 x i1> [[TMP134]] to i8 +; CHECK-NEXT: [[TMP137:%.*]] = bitcast <8 x i1> [[TMP135]] to i8 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP136]], i32 7 +; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP137]], i32 7 +; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i8> [[VEC7]] +; + %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) + %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 + %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) + %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 + %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) + %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 + %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) + %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 + %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) + %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 + %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) + %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 + %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) + %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 + %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) + %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 + ret <8 x i8> %vec7 +} + +declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone + +declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X0:%.*]], <4 x float> [[X0]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = xor <16
x i32> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x float> [[TMP15]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP22]], <16 x i32> [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP17]], <16 x float> [[TMP15]], <16 x float> zeroinitializer +; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSPROP_SELECT]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP4]], [[TMP14]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or <16 x i32> [[_MSPROP_SELECT3]], [[_MSPROP4]] +; CHECK-NEXT: [[RES5:%.*]] = fadd <16 x float> [[TMP23]], [[RES4]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES5]] +; + %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) + %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask) + %res4 = fadd <16 x float> %res1, %res2 + %res5 = fadd <16 x float> %res3, %res4 + ret <16 x float> %res5 +} + +define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(ptr %x0ptr, <16 x float> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x float>, ptr [[X0PTR:%.*]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16
x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %x0 = load <4 x float>, ptr %x0ptr + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_broadcastf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcastf64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(ptr %x0ptr, <8 x double> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x double>, ptr [[X0PTR:%.*]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> 
[[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[_MSPROP]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %x0 = load <4 x double>, ptr %x0ptr + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[MASK]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP19]], <16 x i32> [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[TMP13]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP]], 0 +; CHECK-NEXT: [[RES3:%.*]] = insertvalue { 
<16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP4]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP21]], <16 x i32> [[_MSPROP_SELECT]], 1 +; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP12]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP22]], <16 x i32> [[_MSPROP_SELECT3]], 2 +; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP20]], 2 +; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP23]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) + %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 + %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 + %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 + ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 +} + +define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(ptr %x0ptr, <16 x i32> %x2, i16 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x i32>, ptr [[X0PTR:%.*]], align 16 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %x0 = load <4 x i32>, ptr %x0ptr + %res = call <16 x 
i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_broadcasti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcasti64x4_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> 
[[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP10]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(ptr %x0ptr, <8 x i64> %x2, i8 %mask) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X0:%.*]] = load <4 x i64>, ptr [[X0PTR:%.*]], align 32 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[X0]], <4 x i64> [[X0]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %x0 = load <4 x i64>, ptr %x0ptr + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pabs_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { +; +; 
CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pabs_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) +; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 
%x2) + ret <8 x i64> %res +} + +define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) #0 { +; +; CHECK-LABEL: @test_vptestmq( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[A0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[A1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[A0]], [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP1]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP26]], splat (i64 -1) +; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP28]], [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <8 x i64> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP27]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[M:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP40]], [[TMP17]] +; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP41]], [[TMP18]] +; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES2]] +; + %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) + %res1 = call i8 
@llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) + %res2 = add i8 %res1, %res + ret i8 %res2 +} +declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8) + +define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) #0 { +; +; CHECK-LABEL: @test_vptestmd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[A0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[A1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i1> [[TMP16]] to i16 +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i32> [[A0]], [[TMP2]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i32> [[TMP1]], [[A1]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i32> [[A0]], [[A1]] +; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i32> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i32> [[TMP26]], splat (i32 -1) +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP28]], [[TMP25]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <16 x i32> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP27]], [[TMP30]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[M:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP32]] +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP32]] +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP40]], [[TMP17]] +; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP41]], 
[[TMP18]] +; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[RES2]] +; + %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) + %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) + %res2 = add i16 %res1, %res + ret i16 %res2 +} +declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16) + +declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2) + +define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_ptestnm_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[X0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[X1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[X0]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP1]], [[X1]] +; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i32> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = or <16 x i32> [[TMP30]], [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <16 x i32> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = or <16 x i32> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i32> [[TMP34]], zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[TMP34]], splat (i32 -1) +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i32> [[TMP36]], [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <16 x i32> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> 
[[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <16 x i32> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP25]], [[TMP40]] +; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP26]], [[TMP41]] +; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[RES2]] +; + %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) + %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) + %res2 = add i16 %res, %res1 + ret i16 %res2 +} + +declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2) + +define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_ptestnm_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[X0:%.*]], [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[X1:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP18]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i1> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i1> [[TMP24]] to i8 +; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[X0]], [[TMP2]] +; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP1]], [[X1]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP27]], [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP29]] +; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[X0]], [[X1]] +; CHECK-NEXT: [[TMP33:%.*]] = xor <8 x i64> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer +; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i64> [[TMP34]], zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x 
i64> [[TMP34]], splat (i64 -1) +; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i64> [[TMP36]], [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <8 x i64> [[TMP37]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <8 x i64> [[TMP32]], zeroinitializer +; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP1]] to i8 +; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 +; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP25]], [[TMP40]] +; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP26]], [[TMP41]] +; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES2]] +; + %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) + %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) + %res2 = add i8 %res, %res1 + ret i8 %res2 +} + +declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone +define i16 @test_kand(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kand( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP3]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i1> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP11]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i1> [[TMP13]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP13]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i1> [[TMP17]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16 +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 +; CHECK-NEXT: store i16 [[TMP23]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP24]] +; + %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone +define i16 @test_kandn(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kandn( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to 
<16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[_MSPROP]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP16]] +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP17]] +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 +; CHECK-NEXT: store i16 [[TMP25]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP26]] +; + %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone +define i16 @test_knot(i16 %a0) #0 { +; +; CHECK-LABEL: @test_knot( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: store i16 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP6]] +; + %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0) + ret i16 %res +} + +declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone +define i16 @test_kor(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kor( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP3]], bitcast (<1 x 
i16> splat (i16 8) to <16 x i1>), i32 0), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 1), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 2), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 3), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 4), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 5), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 6), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 7), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 8), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 9), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 10), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 11), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 12), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 13), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 14), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 15), i1 true)> +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) +; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i1> [[TMP17]], splat (i1 true) +; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] +; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP19]] +; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i1> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i1> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 +; CHECK-NEXT: store i16 [[TMP26]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP27]] +; + %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone +; TODO: the two kxnor instructions here are a no-op and should be eliminated, +; probably by FoldConstantArithmetic in SelectionDAG.
+define i16 @test_kxnor(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kxnor( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP6]] to i16 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP8]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i1> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP10]], splat (i1 true) +; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i1> [[_MSPROP2]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[_MSPROP3]] to i16 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP16]] +; + %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone +define i16 @test_kxor(i16 %a0, i16 %a1) #0 { +; +; CHECK-LABEL: @test_kxor( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP8]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i1> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP12]] to i16 +; CHECK-NEXT: store i16 [[TMP13]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP14]] +; + %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8) + %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1) + ret i16 %t2 +} + +declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone +define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 
x i64> %D) #0 { +; CHECK-LABEL: @test_kortestz( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1) +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]] +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]] +; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16 +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16 +; CHECK-NEXT: 
[[TMP44:%.*]] = xor i16 [[TMP43]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0 +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1 +; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]] +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0 +; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32 +; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32 +; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP51]] +; +entry: + %0 = bitcast <8 x i64> %A to <16 x i32> + %1 = bitcast <8 x i64> %B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %C to <16 x i32> + %4 = bitcast <8 x i64> %D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = bitcast <16 x i1> %2 to i16 + %7 = bitcast <16 x i1> %5 to i16 + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone +define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) #0 { +; CHECK-LABEL: @test_kortestc( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1) +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1) +; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]] +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast 
<16 x i1> [[_MSPROP_ICMP]] to i16 +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1> +; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1> +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1> +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true) +; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true) +; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]] +; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]] +; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]] +; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16 +; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16 +; CHECK-NEXT: [[TMP44:%.*]] = xor i16 [[TMP43]], 0 +; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0 +; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1 +; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]] +; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0 +; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]] +; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0 +; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32 +; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32 +; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP51]] +; +entry: + %0 = bitcast <8 x i64> %A to <16 x i32> + %1 = bitcast <8 x i64> %B to <16 x i32> + %2 = icmp ne <16 x i32> %0, %1 + %3 = bitcast <8 x i64> %C to <16 x i32> + %4 = bitcast <8 x i64> %D to <16 x i32> + %5 = icmp ne <16 x i32> %3, %4 + %6 = bitcast <16 x i1> %2 to i16 + %7 = bitcast <16 x i1> %5 to i16 + %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7) + ret i32 %res +} + +define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: @test_cmpps( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16 +; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8) + ret i16 
%res +} +declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32) + +define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { +; CHECK-LABEL: @test_cmppd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8 +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP7]] +; + %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) + ret i8 %res +} +declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) + +define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mul_epi32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr 
@__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 
x i64> +; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) +; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: 
[[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; 
CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> 
[[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; 
CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 
x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epi32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) +; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: 
[[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) +; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) +; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) + +define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 { +; CHECK-LABEL: @test_mul_epu32_rr( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x 
i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP19]] +; + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rrk( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] +; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rrkz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to 
<8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP24]] +; + %b = load <16 x i32>, ptr %ptr_b + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] 
= and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: 
[[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]] +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %b = load <16 x i32>, ptr %ptr_b + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmb( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP19:%.*]] 
= and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP25]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmbk( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], 
zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] +; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru + ret < 8 x i64> %res +} + +define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mul_epu32_rmbkz( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> +; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast 
<16 x i32> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) +; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) +; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] +; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %q = load i64, ptr %ptr_b + %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 + %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer + %b = bitcast <8 x i64> %b64 to <16 x i32> + %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) + +define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b) #0 { +; +; CHECK-LABEL: @test_x86_avx512_mm_cvtu32_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[B:%.*]] to double +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP4]], i64 0 +; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[TMP5]] +; + %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<2 x double>> [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone + +define <16 x float> @test_x86_vbroadcast_ss_512(ptr %a0) #0 { +; +; CHECK-LABEL: 
@test_x86_vbroadcast_ss_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A0:%.*]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> poison, float [[TMP4]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[TMP4]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[TMP4]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[TMP4]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <16 x i32> [[_MSPROP3]], i32 [[_MSLD]], i32 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x float> [[TMP11]], float [[TMP4]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <16 x i32> [[_MSPROP4]], i32 [[_MSLD]], i32 5 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x float> [[TMP12]], float [[TMP4]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i32> [[_MSPROP5]], i32 [[_MSLD]], i32 6 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x float> [[TMP13]], float [[TMP4]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <16 x i32> [[_MSPROP6]], i32 [[_MSLD]], i32 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x float> [[TMP14]], float [[TMP4]], i32 7 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> [[_MSPROP7]], i32 [[_MSLD]], i32 8 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> [[TMP15]], float [[TMP4]], i32 8 +; CHECK-NEXT: [[_MSPROP9:%.*]] = insertelement <16 x i32> [[_MSPROP8]], i32 [[_MSLD]], i32 9 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP4]], i32 9 +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <16 x i32> [[_MSPROP9]], i32 [[_MSLD]], i32 10 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP4]], i32 10 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <16 x i32> [[_MSPROP10]], i32 [[_MSLD]], i32 11 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP4]], i32 11 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <16 x i32> [[_MSPROP11]], i32 [[_MSLD]], i32 12 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 12 +; CHECK-NEXT: [[_MSPROP13:%.*]] = insertelement <16 x i32> [[_MSPROP12]], i32 [[_MSLD]], i32 13 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP4]], i32 13 +; CHECK-NEXT: [[_MSPROP14:%.*]] = insertelement <16 x i32> [[_MSPROP13]], i32 [[_MSLD]], i32 14 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP4]], i32 14 +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x 
i32> [[_MSPROP14]], i32 [[_MSLD]], i32 15 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP4]], i32 15 +; CHECK-NEXT: store <16 x i32> [[_MSPROP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP23]] +; + %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr %a0) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr) nounwind readonly + +define <8 x double> @test_x86_vbroadcast_sd_512(ptr %a0) #0 { +; +; CHECK-LABEL: @test_x86_vbroadcast_sd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[A0:%.*]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP4]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP4]], i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP4]], i32 3 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP4]], i32 5 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP4]], i32 6 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP4]], i32 7 +; CHECK-NEXT: store <8 x i64> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP15]] +; + %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr %a0) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr) nounwind readonly + +declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512( +; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 
[[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x 
i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) + +define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call 
void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) + +define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) + +define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) + 
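+; Reader's note (editorial aside, not autogenerated output): the masked pternlog,
+; permvar, and vpermi2var checks in this file all instantiate one MemorySanitizer
+; shadow recipe for a masked result `%r = select %m, %v, %dst`. In pseudo-IR,
+; with illustrative names (%sv, %sdst, %sm are the shadows of %v, %dst, %m):
+;   %s  = select %m, %sv, %sdst   ; propagate the chosen operand's shadow per lane
+;   %d0 = xor %v, %dst            ; bits where the two candidate values differ...
+;   %d1 = or %d0, %sv
+;   %d2 = or %d1, %sdst           ; ...plus both candidates' shadows
+;   %sr = select %sm, %d2, %s     ; an uninitialized mask bit poisons its lane
+; That is, a lane's result shadow is clean only if its mask bit is initialized and
+; the selected value is clean; the [[_MSPROP_SELECT]] sequences above and below
+; are instances of this shape, with constant operands folded to zeroinitializer.
+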
+define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; + %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: 
unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X4:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP10]] +; + %x2 = load <16 x i32>, ptr %x2p + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, 
<8 x i64>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 
[[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP20]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load 
<8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> 
[[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]]) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> 
[[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP23]] +; + %x2s = load double, ptr %x2ptr + %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 + %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer + %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], 
ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + + +declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> 
%x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) + +define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; 
CHECK-LABEL: @test_vsubps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> 
[[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 
[[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} + +;; mask float +define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: 
[[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] 
to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> 
[[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> zeroinitializer, i16 %mask, i32 11) + ret <16 x float> %res +} + +;; With Passthru value +define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 9) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; 
CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_vmulps_mask_passthru_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float> %passthru, i16 %mask, i32 11) + ret <16 x float> %res +} + +;; mask double +define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 8) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> 
[[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 9) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double> zeroinitializer, i8 %mask, i32 10) + ret <8 x double> %res +} + +define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; +; CHECK-LABEL: @test_vmulpd_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: 
[[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[TMP16]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
+ <8 x double> zeroinitializer, i8 %mask, i32 11)
+ ret <8 x double> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_add_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+; CHECK-LABEL: @test_mm512_sub_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP7]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
+; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP16]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
+ ret <16 x float> %res
+}
+define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
+ ret <16 x float> %res
+}
+
+define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[TMP18]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
+ ret <16 x float> %res
+}
+
+
+define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
+;
+; CHECK-LABEL: @test_mm512_mask_div_round_ps_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x
float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + + +define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9) + ret <16 x float> %res +} +define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; 
CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> 
@llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + +define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 -1) + ret void +} + +define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call 
void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + +define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 -1) + ret void +} + +define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: 
[[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + ret void +} + +declare void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + +define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 -1) + ret void +} + +define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_store_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + ret void +} + +declare void 
@llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + +define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) #0 { +; +; CHECK-LABEL: @test_compress_store_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true)) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 -1) + ret void +} + +define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x double> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP12]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x 
i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x double> zeroinitializer) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask) + +define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x double> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP8]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 -1) + ret <8 x double> %res +} + +; Make sure we don't crash if you pass 0 to the mask. 
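+; (As the CHECK lines below show, a constant zero mask means the shadow
+; expandload simply returns its passthru shadow, and only the pointer
+; shadow is checked before the real llvm.masked.expandload.v8f64 call.)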
+define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_zero_mask_expand_load_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> zeroinitializer, <8 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> zeroinitializer, <8 x double> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP8]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 0) + ret <8 x double> %res +} + +define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x float> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP12]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = 
load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x float> zeroinitializer) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask) + +define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x float> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 -1) + ret <16 x float> %res +} + +define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 
[[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x i64> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask) + +define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call 
<8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x i64> [[DATA:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP8]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 -1) + ret <8 x i64> %res +} + +define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_load_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x i32> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_maskz_expand_load_d_512(ptr %addr, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_load_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; 
CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask) + +define <16 x i32> @test_expand_load_d_512(ptr %addr, <16 x i32> %data) #0 { +; +; CHECK-LABEL: @test_expand_load_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080 +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr +; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x i32> [[DATA:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP8]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 -1) + ret <16 x i32> %res +} + +define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; 
CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; 
CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x 
float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call 
void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret 
<16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mm512_mask_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; 
CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} + +define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) + +define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], 
[[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP5]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 11) + ret <8 x double> %res +} +define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 11) + ret <8 x double> %res +} +define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP14]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 11) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { +; 
CHECK-LABEL: @test_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> undef, i16 -1, i32 4) + ret <16 x float> %res +} +define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 4) + ret <16 x float> %res +} +define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call 
<16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) + ret <16 x float> %res +} +define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP5]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 11) + ret <16 x float> %res +} +define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 11) + ret <16 x float> %res +} +define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP14]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 11) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { 
+; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to 
ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP7]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] 
= call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP16]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP15]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) + +define <8 x i64>@test_int_x86_avx512_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP16]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512_old( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x 
i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP15]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16) + +define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_prol_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or 
<16 x i32> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2
+; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]]
+;
+ %res0 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1)
+ %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+ %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1
+ %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2
+ ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_prol_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2
+; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]]
+;
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1)
+ %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+ %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1
+ %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2
+ ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32>, i32, <16 x i32>, i16)
+
+define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pror_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3))
+; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3))
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4))
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4))
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2
+; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]]
+;
+ %res0 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
+ %res1 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3)
+ %res2 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1)
+ %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0
+ %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1
+ %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2
+ ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64>, i32, <8 x i64>, i8)
+
+define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_pror_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3))
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4))
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]]
+; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]]
+; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5))
+; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0
+; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1
+; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2
+; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2
+; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]]
+;
+ %res0 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
+ %res1 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3)
+ %res2 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1)
+ %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0
+ %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1
+ %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2
+ ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP5]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP16]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], double [[TMP8]], double 0.000000e+00
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[X0]], double [[TMP17]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]]
+; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[TMP24:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]], i32 11)
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 0, i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast double [[TMP24]] to i64
+; CHECK-NEXT: [[TMP30:%.*]] = xor i64 [[TMP29]], 0
+; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP30]], 0
+; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP31]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP32]], i64 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], double [[TMP24]], double 0.000000e+00
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT11]], i64 0
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x double> [[X0]], double [[TMP33]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP18]], [[TMP34]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP13]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES2]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 11)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP13]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[TMP14]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP16]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], float [[TMP8]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[X0]], float [[TMP17]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]]
+; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[TMP24:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]], i32 11)
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 0
+; CHECK-NEXT: [[TMP29:%.*]] = bitcast float [[TMP24]] to i32
+; CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP29]], 0
+; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 0
+; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP31]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i32 [[TMP32]], i32 [[TMP28]]
+; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], float [[TMP24]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT11]], i64 0
+; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[X0]], float [[TMP33]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP12]]
+; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP18]], [[TMP34]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[TMP18]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 11)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64
+; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64
+; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES4]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
+ %res3 = fadd <2 x double> %res, %res1
+ %res4 = fadd <2 x double> %res2, %res3
+ ret <2 x double> %res4
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
+;
+; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
+; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
+; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]]
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32
+; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
+; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]]
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
+; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
+; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
+; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
+; CHECK: 23:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 24:
+; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
+; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
+; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
+; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
+; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
+; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
+; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
+; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
+; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
+; CHECK: 30:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 31:
+; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
+; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
+; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]]
+; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32
+; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
+; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
+; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]]
+; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
+; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]]
+; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0
+; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0
+; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
+; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
+; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
+; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
+; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES4]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
+ %res3 = fadd <4 x float> %res, %res1
+ %res4 = fadd <4 x float> %res2, %res3
+ ret <4 x float> %res4
+}
+
+define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
+;
+; CHECK-LABEL: @fmadd_ss_mask_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
+; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
+; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
+; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32
+; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]]
+; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4
+; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a.val = load float, ptr %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, ptr %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+ %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
+
+ %sr = extractelement <4 x float> %vr, i32 0
+ store float %sr, ptr %a
+ ret void
+}
+
+define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
+;
+; CHECK-LABEL: @fmadd_ss_maskz_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
+; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
+; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
+; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
+; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
+; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
+; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0
+; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]]
+; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00
+; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0
+; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
+; CHECK: 28:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 29:
+; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
+; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
+; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4
+; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
+; CHECK-NEXT: ret void
+;
+ %a.val = load float, ptr %a
+ %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+ %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+ %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+ %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+ %b.val = load float, ptr %b
+ %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+ %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+ %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+ %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+ %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
+
+ %sr = extractelement <4 x float> %vr, i32 0
+ store float %sr, ptr %a
+ ret void
+}
+
+define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
+;
+; CHECK-LABEL: @fmadd_sd_mask_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
+; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
+; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
+; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
+; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
+; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
+; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
+; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
+; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
+; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
+; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64
+; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]]
+; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]]
+; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
+; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0
+; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
+; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
+; CHECK: 29:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 30:
+; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
+; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
+; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
+; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8
+; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
+; CHECK-NEXT: ret void
+;
+ %a.val = load double, ptr %a
+ %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+ %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+ %b.val = load double, ptr %b
+ %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+ %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+ + %vr = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, ptr %a + ret void +} + +define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { +; +; CHECK-LABEL: @fmadd_sd_maskz_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 +; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 +; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] +; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; 
CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 +; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8 +; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 +; CHECK-NEXT: ret void +; + %a.val = load double, ptr %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, ptr %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + + %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4) + + %sr = extractelement <2 x double> %vr, i32 0 + store double %sr, ptr %a + ret void +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double 
[[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], 
label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; 
CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0 +; CHECK-NEXT: 
[[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32 +; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: 
[[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg 
<2 x double> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11) + %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res2, %res3 + ret <2 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { +; +; CHECK-LABEL: 
@test_int_x86_avx512_mask3_vfnmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32 +; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> 
%x2, i8 -1, i32 11) + %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x 
float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) + ret < 4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <4 x float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4) + ret < 4 x float> %res +} + + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP19]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4) + ret < 4 x float> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { +; CHECK-LABEL: 
@test_int_x86_avx512_pmov_qd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> +; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP11]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[TMP10]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <16 x float> 
@llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32) + +define <16 x float> @test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> +; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x 
i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) + +define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) + +define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) + +define <8 x double> 
@test_expand_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> 
@llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) + +define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) + +define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) + ret <8 x i64> %res +} + +define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { +; +; 
CHECK-LABEL: @test_maskz_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res +} + +declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) + +define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; +; CHECK-LABEL: @test_mask_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> 
@llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { +; +; CHECK-LABEL: @test_maskz_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) + +define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, ptr %p) #0 { +; +; CHECK-LABEL: @test_cmp_512( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 1, <16 x i1> splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne 
i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[C:%.*]], <16 x float> [[D:%.*]], i32 1, <16 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] +; CHECK: 15: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr [[P:%.*]], align 64 +; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[P]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 87960930222080 +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP20]], align 64 +; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i1> [[TMP9]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> zeroinitializer, <16 x i32> [[_MSLD]] +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x float> [[TMP17]] to <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> zeroinitializer, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], [[_MSLD]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP26]], <16 x i32> [[TMP22]] +; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP21]], <16 x float> zeroinitializer, <16 x float> [[TMP17]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP27]] +; + entry: + %0 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i32 8) + %1 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i32 4) + %2 = load <16 x float>, ptr %p + %3 = xor <16 x i1> %0, %1 + %4 = select <16 x i1> %3, <16 x float> zeroinitializer, <16 x float> %2 + ret <16 x float> %4 +} + +declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) + +attributes #0 = { sanitize_memory } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll new file mode 100644 index 0000000000000..052b497831ee1 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll @@ -0,0 +1,13714 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512-intrinsics.ll + +define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1) + ret <8 x double> %2 +} + +define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1) + ret <8 x double> %2 +} + +define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_compress_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x double> %1 +} + +define 
<16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1) + ret <16 x float> %2 +} + +define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1) + ret <16 x float> %2 +} + +define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_compress_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: 
call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x float> %1 +} + +define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 
8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_compress_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x i64> %1 +} + +define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_compress_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x i32> %1 +} + +define <8 x double> @test_expand_pd_512(<8 x double> %data) #0 { +; CHECK-LABEL: @test_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x double> %1 +} + +define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> 
[[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1) + ret <8 x double> %2 +} + +define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1) + ret <8 x double> %2 +} + +define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 { +; CHECK-LABEL: @test_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <16 x float> %1 +} + +define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast 
<16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1) + ret <16 x float> %2 +} + +define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1) + ret <16 x float> %2 +} + +define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { +; CHECK-LABEL: @test_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP2]] +; + %1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + ret <8 x i64> %1 +} + +define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP9]] +; + %1 = bitcast i8 %mask to <8 x i1> + %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1) + ret <8 x i64> %2 +} + +define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { +; CHECK-LABEL: @test_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> splat (i1 true)) + ret <16 x i32> %1 +} + +define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_expand_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %1 = bitcast i16 %mask to <16 x i1> + %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1) + ret <16 x i32> %2 +} + +define <16 x float> @test_rcp_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_rcp_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: 
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <8 x double> @test_rcp_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_rcp_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1] + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone + +declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) + +define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) #0 { +; CHECK-LABEL: @test_rndscale_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4) + ret <2 x double>%res +} + +define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_sd_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: 
[[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) + ret <2 x double>%res +} + +define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x double> %c, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_sd_mask_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[B:%.*]] = load <2 x double>, ptr [[BPTR:%.*]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[BPTR]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: 
[[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %b = load <2 x double>, ptr %bptr + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) + ret <2 x double>%res +} + +define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_sd_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4) + ret <2 x double>%res +} + +declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) + +define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) #0 { +; CHECK-LABEL: @test_rndscale_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) + ret <4 x float>%res +} + +define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr) #0 { +; CHECK-LABEL: @test_rndscale_ss_load( +; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[B:%.*]] = load <4 x float>, ptr [[BPTR:%.*]], align 16 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[BPTR]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B]], <4 x float> undef, i8 -1, i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %b = load <4 x float>, ptr %bptr + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) + ret <4 x float>%res +} + +define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_ss_mask( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> 
%a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4) + ret <4 x float>%res +} + +define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) #0 { +; CHECK-LABEL: @test_rndscale_ss_maskz( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4) + ret <4 x float>%res +} + +declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) + +define <8 x double> @test7(<8 x double> %a) #0 { +; CHECK-LABEL: @test7( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> [[A:%.*]], i32 11, <8 x double> [[A]], i8 -1, i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4) + ret <8 x double>%res +} + +declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32) + +define <16 x float> @test8(<16 x float> %a) #0 { +; CHECK-LABEL: @test8( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> [[A:%.*]], i32 11, <16 x float> [[A]], i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4) + ret <16 x float>%res +} + +define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_rsqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone + +define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP2]] +; + %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) + ret <8 x double> %1 +} + +define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] +; 
CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP13]] +; + %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP11]] +; + %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} +declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) + +define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP5]] +; + %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) + ret <8 x double> %1 +} + +define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru + ret <8 x double> %3 +} + +define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP14]] +; + %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} +declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone + +define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_sqrt_ps_512( +; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP2]] +; + %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) + ret <16 x float> %1 +} + +define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP13]] +; + %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP11]] +; + %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) + 
%2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} +declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) + +define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP5]] +; + %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_mask_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP14]] +; + %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} +declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone + +define <8 x double> @test_getexp_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_getexp_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) #0 { +; CHECK-LABEL: @test_getexp_round_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 12) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %res = call <8 x double> 
@llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 12) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone + +define <16 x float> @test_getexp_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_getexp_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} + +define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) #0 { +; CHECK-LABEL: @test_getexp_round_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone + +declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_sqrt_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 9) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] +; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] +; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 [[MASK]], i32 10) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 +; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] +; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; CHECK: 21: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 22: +; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 -1, i32 11) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES_2:%.*]] = fadd <4 x float> [[RES2]], [[RES3]] +; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[RES_1]], [[RES_2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + %res1 = call <4 x float> 
@llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9) + %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10) + %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11) + + %res.1 = fadd <4 x float> %res0, %res1 + %res.2 = fadd <4 x float> %res2, %res3 + %res = fadd <4 x float> %res.1, %res.2 + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_sqrt_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 9) +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; 
CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] +; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] +; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 [[MASK]], i32 10) +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 +; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] +; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] +; CHECK: 21: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 22: +; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 -1, i32 11) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES_2:%.*]] = fadd <2 x double> [[RES2]], [[RES3]] +; CHECK-NEXT: [[RES:%.*]] = fadd <2 x double> [[RES_1]], [[RES_2]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9) + %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10) + %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11) + + %res.1 = fadd <2 x double> %res0, %res1 + %res.2 = fadd <2 x double> %res2, %res3 + %res = fadd <2 x double> %res.1, %res.2 + ret <2 x double> %res +} + +define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttsd2usi( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0]], i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 
@llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ; + %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttsd2si( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0]], i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ; + %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttss2si( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0:%.*]], i32 8) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0]], i32 4) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ; + %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvttss2si_load(ptr %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttss2si_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br 
i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] +; CHECK: 2: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 3: +; CHECK-NEXT: [[A1:%.*]] = load <4 x float>, ptr [[A0:%.*]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A1]], i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %a1 = load <4 x float>, ptr %a0 + %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ; + ret i32 %res +} + +define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvttss2usi( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0:%.*]], i32 8) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0]], i32 4) +; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES2]] +; + %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ; + %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ; + %res2 = add i32 %res0, %res1 + ret i32 %res2 +} +declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtsd2usi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 
6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtsd2si32( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtss2usi32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone + +define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_cvtss2si32( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] +; CHECK: 3: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 4: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0:%.*]], i32 4) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 11) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 9) +; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] +; CHECK-NEXT: store i32 0, ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES4]] +; + %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) + %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11) + %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9) + %res3 = add i32 %res, %res1 + %res4 = add i32 %res3, %res2 + ret i32 %res4 +} +declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone + +define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, ptr %dst) #0 { +; CHECK-LABEL: @test_x86_vcvtps2ph_256( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0:%.*]], i32 2, <16 x i16> zeroinitializer, i16 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 11, <16 x i16> zeroinitializer, i16 [[MASK:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i16> [[TMP3]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i16 [[TMP2]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] +; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES3:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 12, <16 x i16> [[SRC:%.*]], i16 [[MASK]]) +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] +; CHECK: 15: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 16: +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[DST:%.*]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 87960930222080 +; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr [[TMP19]], align 32 +; CHECK-NEXT: store <16 x i16> 
[[RES1]], ptr [[DST]], align 32 +; CHECK-NEXT: [[RES:%.*]] = add <16 x i16> [[RES2]], [[RES3]] +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[RES]] +; + %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask) + %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask) + store <16 x i16> %res1, ptr %dst + %res = add <16 x i16> %res2, %res3 + ret <16 x i16> %res +} + +declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly + +define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { +; CHECK-LABEL: @test_cmpps( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16 +; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> splat (i1 true), i32 8) + %1 = bitcast <16 x i1> %res to i16 + ret i16 %1 +} +declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) + +define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { +; CHECK-LABEL: @test_cmppd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8 +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[TMP7]] +; + %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> splat (i1 true), i32 4) + 
%1 = bitcast <8 x i1> %res to i8 + ret i8 %1 +} +declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32) + + + ; fp min - max +define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) #0 { +; CHECK-LABEL: @test_vmaxpd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) + ret <8 x double> %1 +} +declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) + +define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) #0 { +; CHECK-LABEL: @test_vminpd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) + ret <8 x double> %1 +} +declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) + +define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_store_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP1]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP1]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]] +; 
CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[MASK]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP9]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP14]], i32 1, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] +; CHECK: 16: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 17: +; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <4 x i1> [[EXTRACT]]) +; CHECK-NEXT: ret void +; + %1 = and i8 %mask, 1 + %2 = bitcast i8 %1 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + call void @llvm.masked.store.v4f32.p0(<4 x float> %data, ptr %ptr, i32 1, <4 x i1> %extract) + ret void +} +declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) #1 + + +declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) +declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) +declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32) + +define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vsubps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x 
i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> 
@llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { +; CHECK-LABEL: @test_vmulps_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to 
<16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x 
float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> 
%passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: 
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x 
float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_vmulps_mask_passthru_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru + ret <16 x float> %3 +} + +define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 
x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_vmulpd_mask_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label 
[[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, 
<16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: 
unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x 
float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void 
@__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] 
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), 
align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_add_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; 
CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, 
i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_add_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> 
[[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 
x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: 
@test_mm512_mask_sub_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x 
float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label 
[[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_sub_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x 
i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x 
i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> 
[[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void 
@llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x 
float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load 
<16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; 
CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: 
[[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_div_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; 
CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { 
+; CHECK-LABEL: @test_mm512_min_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_min_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) + +define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; 
CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> 
%a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_mask_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = 
bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src + ret <16 x float> %3 +} + +define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) + ret <16 x float> %1 +} + +define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_mm512_max_round_ps_current( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> 
[[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) + ret <16 x float> %1 +} +declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32) + +declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_rn( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_rd( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 9) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_ru( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 10) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10) + ret <4 x float> %res +} + +define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_add_ss_rz( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_ss_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_add_ss_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_ss_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_ss_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_ru(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_rz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_current(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
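+; Editorial note: the trailing i32 operand of these intrinsics is the X86
+; _MM_FROUND_* rounding encoding: 8 = rn-sae, 9 = rd-sae, 10 = ru-sae,
+; 11 = rz-sae, 4 = CUR_DIRECTION (use the MXCSR rounding mode).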
+define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_add_sd_rn(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_add_sd_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_add_sd_current_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_max_ss_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
+; CHECK-LABEL: @test_max_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_ss_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_ss_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
+; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
+; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
+; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
+; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %a1.val = load float, ptr %a1
+ %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+ %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+ %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+ %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+ %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+ ret <4 x float> %res
+}
+declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_max_sd_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_max_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
+; CHECK-LABEL: @test_mask_max_sd_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
+; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
+; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
+; CHECK-LABEL: @test_maskz_max_sd_memfold(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
+; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
+; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
+; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
+; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
+; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
+; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
+; CHECK: 11:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 12:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES]]
+;
+ %a1.val = load double, ptr %a1
+ %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+ %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+ %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+ ret <2 x double> %res
+}
+
+define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512_cvtsi2ss32(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 11)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 9)
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss_mem(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 10:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 9)
+; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %b = load i32, ptr %ptr
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 4)
+; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1]
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, ptr %ptr) #0 {
+; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss_mem(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0
+; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
+; CHECK: 9:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+;
CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 4) +; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %b = load i32, ptr %ptr + %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1] + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone + +declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP9]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: 
[[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 + ret <16 x i32> %3 +} + +declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) + +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP9]] +; + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) + ret <8 x double> %1 +} + +define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) + %2 = bitcast <8 x i64> %x1 to <8 x double> + %3 = bitcast i8 %x3 to <8 x i1> + %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2 + ret <8 x double> %4 +} + +declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) + +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP9]] +; + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) + ret <16 x float> %1 +} + +define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP20]] +; + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) + %2 = bitcast <16 x i32> %x1 to <16 x float> + %3 = bitcast i16 %x3 to <16 x i1> + %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 + ret <16 x float> 
%4 +} + +declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP4]] +; + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 + ret <8 x i64> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP17]] +; + %x2 = load <16 x i32>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x 
i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer +; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]]) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64> +; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]] +; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP23]] +; + %x2s = load double, ptr %x2ptr + %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 + %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 
[[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x 
i64> %x0, <8 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP4]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 + ret <16 x i32> %3 +} + +declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) +define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512( +; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 11) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x double> [[X2]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11) + %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) +define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to 
ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 10) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x float> [[X2]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10) + %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8) + %res2 = fadd <16 x float> %res, %res1 + ret <16 x float> %res2 +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 
[[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i8> [[RES4]] +; + %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) + %res3 = add <16 x i8> %res0, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64>, i8) + +define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_mem_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) + call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) + ret void +} + +declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) + +define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x 
i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i16> [[RES4]]
+;
+ %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1)
+ %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2)
+ %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2)
+ %res3 = add <8 x i16> %res0, %res1
+ %res4 = add <8 x i16> %res3, %res2
+ ret <8 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP2]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ ret <8 x i32> %1
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]]
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP11]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+ ret <8 x i32> %3
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]]
+; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer
+; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[TMP10]]
+;
+ %1 = trunc <8 x i64> %x0 to <8 x i32>
+ %2 = bitcast i8 %x2 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer
+ ret <8 x i32> %3
+}
+
+declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovs_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ ret <8 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2)
+ ret <8 x i32> %res
+}
+
+define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovus_qd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]])
+; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2)
+ ret <8 x i32> %res
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64>, i8)
+
+define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1)
+ call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i8> [[RES4]]
+;
+ %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
+ %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
+ %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i8> %res0, %res1
+ %res4 = add <16 x i8> %res3, %res2
+ ret <16 x i8> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES4]]
+;
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
+; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
+; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
+; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i16> [[RES4]]
+;
+ %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
+ %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
+ %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
+ %res3 = add <16 x i16> %res0, %res1
+ %res4 = add <16 x i16> %res3, %res2
+ ret <16 x i16> %res4
+}
+
+declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32>, i16)
+
+define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_mem_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
+; CHECK-NEXT: ret void
+;
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
+ call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
+ ret void
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
+; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = 
call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]]) +; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[RES4]] +; + %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1) + %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) + %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2) + %res3 = add <16 x i16> %res0, %res1 + %res4 = add <16 x i16> %res3, %res2 + ret <16 x i16> %res4 +} + +declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32>, i16) + +define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_mem_512( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1) +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]]) +; CHECK-NEXT: ret void +; + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1) + call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2) + ret void +} + +declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32) + +define 
<16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %cvt = sitofp <16 x i32> %x0 to <16 x float> + %1 = bitcast i16 %x2 to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1 + %3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8) + %res2 = fadd <16 x float> %2, %3 + ret <16 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], 
label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32) + +define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0:%.*]], <8 x float> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0]], <8 x float> [[X1]], i8 -1, i32 10) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> 
[[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[RES2]] +; + %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4) + %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 10) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0 +; CHECK-NEXT: 
[[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0]], <8 x double> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES2]] +; + %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8) + %res2 = fadd <8 x double> %res, %res1 + ret <8 x double> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret 
<16 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32) + +define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x 
float> [[CVT]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer +; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES2]] +; + %cvt = uitofp <16 x i32> %x0 to <16 x float> + %1 = bitcast i16 %x2 to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1 + %3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8) + %res2 = fadd <16 x float> %2, %3 + ret <16 x float> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32) + +define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> 
@llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i32> [[RES2]] +; + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2dq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32) + +define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2udq_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] +; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8) +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4) + %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8) + %res2 = add <16 x i32> %res, %res1 + ret <16 x i32> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone + +define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) #0 { +; CHECK-LABEL: @test_getexp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> 
zeroinitializer, i8 -1, i32 8) + ret <4 x float> %res +} + +define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_getexp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 8) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES_1]] +; + %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8) + %res.1 = fadd <4 x float> %res0, %res1 + ret <4 x float> %res.1 +} + +define <4 x float> @test_maskz_getexp_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_getexp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8) + ret <4 x float> %res +} + +declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone + +define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_getexp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4) + ret <2 x double> %res +} + +define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_mask_getexp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 
[[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] +; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] +; CHECK: 13: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 14: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 8) +; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES_1]] +; + %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) + %res.1 = fadd <2 x double> %res0, %res1 + ret <2 x double> %res.1 +} + +define <2 x double> @test_maskz_getexp_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_maskz_getexp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; 
CHECK: 7: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8) + ret <2 x double> %res +} + +declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32) + +define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 5, i8 [[X3:%.*]], i32 8) +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES4]] +; + %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) + ret i8 %res4 +} + +define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd_all( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 2, i8 -1, i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; 
CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 11: +; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 3, i8 -1, i32 8) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]] +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] +; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 15: +; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 4, i8 [[X3:%.*]], i32 4) +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0 +; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]] +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]] +; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] +; CHECK: 18: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 19: +; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 5, i8 [[X3]], i32 8) +; CHECK-NEXT: [[TMP20:%.*]] = xor i8 [[RES1]], -1 +; CHECK-NEXT: [[TMP21:%.*]] = xor i8 [[RES2]], -1 +; CHECK-NEXT: [[TMP22:%.*]] = and i8 [[TMP20]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = and i8 0, [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = or i8 0, [[TMP22]] +; CHECK-NEXT: [[TMP25:%.*]] = or i8 [[TMP24]], [[TMP23]] +; CHECK-NEXT: [[RES11:%.*]] = or i8 [[RES1]], [[RES2]] +; CHECK-NEXT: [[TMP26:%.*]] = xor i8 [[RES3]], -1 +; CHECK-NEXT: [[TMP27:%.*]] = xor i8 [[RES4]], -1 +; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP26]], 0 +; CHECK-NEXT: [[TMP29:%.*]] = and i8 0, [[TMP27]] +; CHECK-NEXT: [[TMP30:%.*]] = or i8 0, [[TMP28]] +; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP30]], [[TMP29]] +; CHECK-NEXT: [[RES12:%.*]] = or i8 [[RES3]], [[RES4]] +; CHECK-NEXT: [[TMP32:%.*]] = xor i8 [[RES11]], -1 +; CHECK-NEXT: [[TMP33:%.*]] = xor i8 [[RES12]], -1 +; CHECK-NEXT: [[TMP34:%.*]] = and i8 [[TMP25]], [[TMP31]] +; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = and i8 [[TMP25]], [[TMP33]] +; CHECK-NEXT: [[TMP37:%.*]] = or i8 [[TMP34]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = or i8 [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[RES13:%.*]] = or i8 [[RES11]], [[RES12]] +; CHECK-NEXT: store i8 [[TMP38]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES13]] +; + %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) + %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8) + %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, 
i8 %x3, i32 4) + %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) + + %res11 = or i8 %res1, %res2 + %res12 = or i8 %res3, %res4 + %res13 = or i8 %res11, %res12 + ret i8 %res13 +} + +declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32) + +define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 3, i8 [[X3:%.*]], i32 4) +; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i8 [[RES2]] +; + %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) + ret i8 %res2 +} + + +define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss_all( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 2, i8 -1, i32 4) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] +; CHECK: 10: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; 
+; CHECK: 11:
+; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 3, i8 -1, i32 8)
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0
+; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]]
+; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
+; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 14:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 15:
+; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 4, i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0
+; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]]
+; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]]
+; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
+; CHECK: 18:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 19:
+; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 5, i8 [[X3]], i32 8)
+; CHECK-NEXT: [[TMP20:%.*]] = and i8 [[RES1]], 0
+; CHECK-NEXT: [[TMP21:%.*]] = and i8 0, [[RES2]]
+; CHECK-NEXT: [[TMP22:%.*]] = or i8 0, [[TMP20]]
+; CHECK-NEXT: [[TMP23:%.*]] = or i8 [[TMP22]], [[TMP21]]
+; CHECK-NEXT: [[RES11:%.*]] = and i8 [[RES1]], [[RES2]]
+; CHECK-NEXT: [[TMP24:%.*]] = and i8 [[RES3]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = and i8 0, [[RES4]]
+; CHECK-NEXT: [[TMP26:%.*]] = or i8 0, [[TMP24]]
+; CHECK-NEXT: [[TMP27:%.*]] = or i8 [[TMP26]], [[TMP25]]
+; CHECK-NEXT: [[RES12:%.*]] = and i8 [[RES3]], [[RES4]]
+; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP23]], [[TMP27]]
+; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[RES11]], [[TMP27]]
+; CHECK-NEXT: [[TMP30:%.*]] = and i8 [[TMP23]], [[RES12]]
+; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP28]], [[TMP29]]
+; CHECK-NEXT: [[TMP32:%.*]] = or i8 [[TMP31]], [[TMP30]]
+; CHECK-NEXT: [[RES13:%.*]] = and i8 [[RES11]], [[RES12]]
+; CHECK-NEXT: store i8 [[TMP32]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret i8 [[RES13]]
+;
+ %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
+ %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8)
+ %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4)
+ %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8)
+
+ %res11 = and i8 %res1, %res2
+ %res12 = and i8 %res3, %res4
+ %res13 = and i8 %res11, %res12
+ ret i8 %res13
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0:%.*]], i32 11, <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0]], i32 11, <8 x double> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8)
+ %res2 = fadd <8 x double> %res, %res1
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+
+define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0:%.*]], i32 11, <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
+; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
+; CHECK: 10:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 11:
+; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0]], i32 11, <16 x float> [[X2]], i16 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4)
+ %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8)
+ %res2 = fadd <16 x float> %res, %res1
+ ret <16 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_sd(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 11, <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 12, <2 x double> zeroinitializer, i8 [[X3]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]]
+; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 13, <2 x double> [[X2]], i8 [[X3]], i32 8)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP21:%.*]] = icmp ne i128 [[TMP21]], 0
+; CHECK-NEXT: [[_MSOR22:%.*]] = or i1 [[_MSOR20]], [[_MSCMP21]]
+; CHECK-NEXT: br i1 [[_MSOR22]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 14, <2 x double> [[X2]], i8 -1, i32 4)
+; CHECK-NEXT: [[RES11:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES12:%.*]] = fadd <2 x double> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES13:%.*]] = fadd <2 x double> [[RES11]], [[RES12]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES13]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 12, <2 x double> zeroinitializer, i8 %x3, i32 4)
+ %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 13, <2 x double> %x2, i8 %x3, i32 8)
+ %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 14, <2 x double> %x2, i8 -1, i32 4)
+ %res11 = fadd <2 x double> %res, %res1
+ %res12 = fadd <2 x double> %res2, %res3
+ %res13 = fadd <2 x double> %res11, %res12
+ ret <2 x double> %res13
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 11, <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
+; CHECK: 12:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 13:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 12, <4 x float> zeroinitializer, i8 [[X3]], i32 4)
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
+; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
+; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
+; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
+; CHECK: 17:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 18:
+; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 13, <4 x float> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i128 [[TMP19]], 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i128 [[TMP20]], 0
+; CHECK-NEXT: [[_MSOR18:%.*]] = or i1 [[_MSCMP16]], [[_MSCMP17]]
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP21]], 0
+; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSOR18]], [[_MSCMP19]]
+; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 22:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 23:
+; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 14, <4 x float> [[X2]], i8 -1, i32 4)
+; CHECK-NEXT: [[RES11:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: [[RES12:%.*]] = fadd <4 x float> [[RES2]], [[RES3]]
+; CHECK-NEXT: [[RES13:%.*]] = fadd <4 x float> [[RES11]], [[RES12]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES13]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 12, <4 x float> zeroinitializer, i8 %x3, i32 4)
+ %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 13, <4 x float> %x2, i8 -1, i32 8)
+ %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 14, <4 x float> %x2, i8 -1, i32 4)
+ %res11 = fadd <4 x float> %res, %res1
+ %res12 = fadd <4 x float> %res2, %res3
+ %res13 = fadd <4 x float> %res11, %res12
+ ret <4 x float> %res13
+}
+
+define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, ptr %x1p) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss_load(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[X1:%.*]] = load <4 x float>, ptr [[X1P:%.*]], align 16
+; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[X1P]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
+; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1]], i32 11, <4 x float> undef, i8 -1, i32 4)
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES]]
+;
+ %x1 = load <4 x float>, ptr %x1p
+ %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %res
+}
+
+declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>)
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ ret <8 x double> %res
+}
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> [[X2]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2
+ ret <8 x double> %res2
+}
+
+define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x double> [[RES2]]
+;
+ %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1)
+ %mask.cast = bitcast i8 %mask to <8 x i1>
+ %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer
+ ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>)
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
+; CHECK: 6:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 7:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1)
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
+; CHECK: 3:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 4:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> )
+ ret <16 x float> %res
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> )
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2
+ ret <16 x float> %res2
+}
+
+define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
+; CHECK: 4:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 5:
+; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
+; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP7]]
+; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x float> [[RES2]]
+;
+ %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> )
+ %mask.cast = bitcast i16 %mask to <16 x i1>
+ %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer
+ ret <16 x float> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ss2sd_round(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0]], <4 x float> [[X1]], <2 x double> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]]
+; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <2 x double> [[RES2]]
+;
+ %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4)
+ %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8)
+ %res2 = fadd <2 x double> %res, %res1
+ ret <2 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_sd2ss_round(
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
+; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0:%.*]], <2 x double> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 11)
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
+; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
+; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
+; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
+; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
+; CHECK: 13:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 14:
+; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0]], <2 x double> [[X1]], <4 x float> [[X2]], i8 -1, i32 8)
+; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
+; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <4 x float> [[RES2]]
+;
+ %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
+ %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
+ %res2 = fadd <4 x float> %res, %res1
+ ret <4 x float> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
+
+define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP9]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ ret <16 x i32> %1
+}
+
+define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]]
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ %2 = bitcast i16 %x4 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
+ ret <16 x i32> %3
+}
+
+define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
+; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <16 x i32> [[TMP17]]
+;
+ %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
+ %2 = bitcast i16 %x4 to <16 x i1>
+ %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+ ret <16 x i32> %3
+}
+
+declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
+
+define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
+; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP9]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ ret <8 x i64> %1
+}
+
+define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]]
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]]
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]]
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]]
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ %2 = bitcast i8 %x4 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
+ ret <8 x i64> %3
+}
+
+define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
+; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
+; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
+; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
+; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
+; CHECK: 8:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 9:
+; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
+; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
+; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
+; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
+; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: ret <8 x i64> [[TMP17]]
+;
+ %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
+ %2 = bitcast i8 %x4 to <8 x i1>
+ %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+ ret <8 x i64> %3
+}
+
+define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
+; CHECK-LABEL: @test_x86_avx512_comi_sd_eq_sae(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
+; CHECK-NEXT: call void @llvm.donothing()
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_comi_sd_eq( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], 
label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_comi_sd_lt_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt_sae( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 8) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8) + ret i32 %res +} + +define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_comi_sd_lt( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; 
CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4) + ret i32 %res +} + +define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32) + +define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_ucomi_ss_lt( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i32 9, i32 4) +; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[RES]] +; + %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4) + ret i32 %res +} + +declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) + +declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) + +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr 
@__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP7]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + ret <8 x double> %1 +} + +define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP18]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2 + ret <8 
x double> %3 +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[TMP16]] +; + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 +} + +declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) + +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + +define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP12]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP11]] +; + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) + +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP7]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + ret <16 x float> %1 +} + +define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 8: +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2 + ret <16 x float> %3 +} + +define <16 x 
float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP16]] +; + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 +} + +declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) + +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) #0 { +; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP12]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP11]] +; + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) + +define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: 
@test_int_x86_avx512_mask_fixupimm_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 4, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> [[X1]], <8 x i64> [[X2]], i32 5, i8 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 3, i8 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES4]] +; + %res = 
call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4) + %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, ptr %x2ptr) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X2:%.*]] = load <8 x i64>, ptr [[X2PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[_MSLD]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2]], i32 3, i8 -1, i32 4) +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %x2 = load <8 x i64>, ptr %x2ptr + %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4) + ret <8 x double> %res +} + +declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) + +define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_pd_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 
8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 3, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 2, i8 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES4]] +; + %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4) + %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4) + %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> 
%x2, i32 2, i8 -1, i32 8) + %res3 = fadd <8 x double> %res, %res1 + %res4 = fadd <8 x double> %res3, %res2 + ret <8 x double> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32) + +define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> 
@llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4) + %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32) + +define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x 
i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 6, i8 -1, i32 4) +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 6, i8 -1, i32 4) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res3, %res2 + ret <4 x float> %res4 +} + +declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) + +define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = 
icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 5, i16 [[X4]], i32 4) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 5, i16 -1, i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES4]] +; + %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) + %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, ptr %x2ptr) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512_load( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2PTR:%.*]], align 64 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load 
<16 x i32>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2]], i32 5, i16 -1, i32 4) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %x2 = load <16 x i32>, ptr %x2ptr + %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4) + ret <16 x float> %res +} + +declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) + +define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ps_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 
[[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 6, i16 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 7, i16 -1, i32 4) +; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES4]] +; + %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) + %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 6, i16 %x4, i32 8) + %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 7, i16 -1, i32 4) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res3, %res2 + ret <16 x float> %res4 +} + +declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) + +define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], 
label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 6, i8 -1, i32 4) +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 6, i8 -1, i32 4) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) + +define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] +; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 +; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] +; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] +; CHECK: 12: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 13: +; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 +; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 +; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] +; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] +; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] +; CHECK: 17: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 18: +; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]], i32 8) +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) + %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) + %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x 
double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8) + %res3 = fadd <2 x double> %res, %res1 + %res4 = fadd <2 x double> %res3, %res2 + ret <2 x double> %res4 +} + +declare double @llvm.fma.f64(double, double, double) #1 +declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0 + +define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP5]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label 
[[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double %1 + %8 = insertelement <2 x double> %x0, double %7, i64 0 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %x2, i64 0 + %12 = call double 
@llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) + %13 = insertelement <2 x double> %x0, double %12, i64 0 + %14 = extractelement <2 x double> %x0, i64 0 + %15 = extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %x2, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, double %17, double %14 + %21 = insertelement <2 x double> %x0, double %20, i64 0 + %res3 = fadd <2 x double> %8, %13 + %res4 = fadd <2 x double> %21, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i32 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32 +; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = 
extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %x2, i64 0 + %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) + %13 = insertelement <4 x float> %x0, float %12, i64 0 + %14 = extractelement <4 x float> %x0, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %x2, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, float %17, float %14 + %21 = insertelement <4 x float> %x0, float %20, i64 0 + %res3 = fadd <4 x float> %8, %13 + %res4 = fadd <4 x float> %21, %res3 + ret <4 x float> %res4 +} + +define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 0, i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP4]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP10]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i64 [[TMP11]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> [[X0]], double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]], i32 11) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 0, i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i64 [[TMP24]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], double [[TMP17]], double 0.000000e+00 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 +; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP13]], [[TMP26]] +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES2]] +; + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double 
%1, double %2, double %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double 0.000000e+00 + %8 = insertelement <2 x double> %x0, double %7, i64 0 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %x2, i64 0 + %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) + %13 = bitcast i8 %x3 to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, double %12, double 0.000000e+00 + %16 = insertelement <2 x double> %x0, double %15, i64 0 + %res2 = fadd <2 x double> %8, %16 + ret <2 x double> %res2 +} + +declare float @llvm.fma.f32(float, float, float) #1 +declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0 + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast float [[TMP4]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP8]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP10]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP11]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[X0]], float [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]], i32 11) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i32 [[TMP24]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP17]], float 0.000000e+00 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 +; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP13]], [[TMP26]] +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES2]] +; + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float 
%4, float 0.000000e+00 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %x2, i64 0 + %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) + %13 = bitcast i8 %x3 to <8 x i1> + %14 = extractelement <8 x i1> %13, i64 0 + %15 = select i1 %14, float %12, float 0.000000e+00 + %16 = insertelement <4 x float> %x0, float %15, i64 0 + %res2 = fadd <4 x float> %8, %16 + ret <4 x float> %res2 +} + +define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr nocapture readonly %1, float %2, float %3) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_load0( +; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[TMP1:%.*]], align 16 +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP14]], align 16 +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[_MSLD]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], [[TMP6]] +; CHECK-NEXT: [[_MSPROP2:%.*]] = or i32 [[_MSPROP1]], [[TMP7]] +; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.fma.f32(float [[TMP15]], float [[TMP2:%.*]], float [[TMP3:%.*]]) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP0:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <8 x i1> [[TMP17]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[_MSPROP2]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP16]] to i32 +; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 +; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP3]], i32 [[TMP24]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP16]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <4 x i32> [[_MSLD]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP25]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP26]] +; + %5 = load <4 x float>, ptr %1, align 16 + %6 = extractelement <4 x float> %5, i64 0 + %7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2 + %8 = bitcast i8 %0 to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = 
select i1 %9, float %7, float 0.000000e+00 + %11 = insertelement <4 x float> %5, float %10, i64 0 + ret <4 x float> %11 +} + +define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64 +; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: 
[[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = extractelement <2 x double> %x0, i64 0 + %2 = extractelement <2 x double> %x1, i64 0 + %3 = extractelement <2 x double> %x2, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double %3 + %8 = insertelement <2 x double> %x2, double %7, i64 0 + %9 = extractelement <2 x double> %x0, i64 0 + %10 = extractelement <2 x double> %x1, i64 0 + %11 = extractelement <2 x double> %x2, i64 0 + %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) + %13 = insertelement <2 x double> %x2, double %12, i64 0 + %14 = extractelement <2 x double> %x0, i64 0 + %15 
= extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %x2, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, double %17, double %16 + %21 = insertelement <2 x double> %x2, double %20, i64 0 + %res3 = fadd <2 x double> %8, %13 + %res4 = fadd <2 x double> %21, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]] +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 +; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] +; CHECK-NEXT: [[_MSCMP20:%.*]] = 
icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] +; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] +; CHECK: 23: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 24: +; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) +; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] +; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] +; CHECK: 30: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 31: +; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) +; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]] +; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 +; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32 +; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 +; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]] +; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]] +; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0 +; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] +; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %x2, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %3 + %8 = insertelement <4 x float> %x2, float %7, i64 0 + %9 = extractelement <4 x float> %x0, i64 0 + %10 = 
extractelement <4 x float> %x1, i64 0 + %11 = extractelement <4 x float> %x2, i64 0 + %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) + %13 = insertelement <4 x float> %x2, float %12, i64 0 + %14 = extractelement <4 x float> %x0, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %x2, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) + %18 = bitcast i8 %x3 to <8 x i1> + %19 = extractelement <8 x i1> %18, i64 0 + %20 = select i1 %19, float %17, float %16 + %21 = insertelement <4 x float> %x2, float %20, i64 0 + %res3 = fadd <4 x float> %8, %13 + %res4 = fadd <4 x float> %21, %res3 + ret <4 x float> %res4 +} + +define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_ss_mask_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 +; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 +; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 +; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 +; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 +; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 
x float> [[BV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 +; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32 +; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]] +; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]] +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0 +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] +; CHECK: 29: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 30: +; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 +; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr +; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4 +; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 +; CHECK-NEXT: ret void +; + %a.val = load float, ptr %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, ptr %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + %1 = extractelement <4 x float> %av, i64 0 + %2 = extractelement <4 x float> %bv, i64 0 + %3 = extractelement <4 x float> %av, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 
x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %av, float %7, i64 0 + %sr = extractelement <4 x float> %8, i32 0 + store float %sr, ptr %a + ret void +} + +define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_ss_maskz_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 +; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 +; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 +; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 +; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 +; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2 +; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 +; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 +; 
CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] +; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]] +; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0 +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0 +; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 +; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4 +; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 +; CHECK-NEXT: ret void +; + %a.val = load float, ptr %a + %av0 = insertelement <4 x float> undef, float %a.val, i32 0 + %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 + %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 + %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 + + %b.val = load float, ptr %b + %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 + %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 + %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 + %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 + %1 = extractelement <4 x float> %av, i64 0 + %2 = extractelement <4 x float> %bv, i64 0 + %3 = extractelement <4 x float> %av, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float 0.000000e+00 + %8 = insertelement <4 x float> %av, float %7, i64 0 + %sr = extractelement <4 x float> %8, i32 0 + store float %sr, ptr %a + ret void +} + +define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_sd_mask_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 +; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 +; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] +; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64 +; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]] +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x 
i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] +; CHECK: 29: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 30: +; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 +; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr +; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8 +; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 +; CHECK-NEXT: ret void +; + %a.val = load double, ptr %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, ptr %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + %1 = extractelement <2 x double> %av, i64 0 + %2 = extractelement <2 x double> %bv, i64 0 + %3 = extractelement <2 x double> %av, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double %1 + %8 = insertelement <2 x double> %av, double %7, i64 0 + %sr = extractelement <2 x double> %8, i32 0 + store double %sr, ptr %a + ret void +} + +define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { +; CHECK-LABEL: @fmadd_sd_maskz_memfold( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 +; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 +; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] +; CHECK: 9: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 10: +; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 +; 
CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 +; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 +; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 +; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 +; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] +; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] +; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]] +; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]] +; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00 +; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 +; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0 +; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 +; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr +; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8 +; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 +; CHECK-NEXT: ret void +; + %a.val = load double, ptr %a + %av0 = insertelement <2 x double> undef, double %a.val, i32 0 + %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 + + %b.val = load double, ptr %b + %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 + %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 + %1 = extractelement <2 x double> %av, i64 0 + %2 = extractelement <2 x double> %bv, i64 0 + %3 = extractelement <2 x double> %av, i64 0 + %4 = call double @llvm.fma.f64(double %1, double %2, double %3) + %5 = bitcast i8 %c to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, double %4, double 0.000000e+00 + 
%8 = insertelement <2 x double> %av, double %7, i64 0 + %sr = extractelement <2 x double> %8, i32 0 + store double %sr, ptr %a + ret void +} + +define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double [[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; 
CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64 +; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = fneg <2 x double> %x2 + %2 = extractelement <2 x double> %x0, i64 0 
+ %3 = extractelement <2 x double> %x1, i64 0 + %4 = extractelement <2 x double> %1, i64 0 + %5 = call double @llvm.fma.f64(double %2, double %3, double %4) + %6 = extractelement <2 x double> %x2, i64 0 + %7 = bitcast i8 %x3 to <8 x i1> + %8 = extractelement <8 x i1> %7, i64 0 + %9 = select i1 %8, double %5, double %6 + %10 = insertelement <2 x double> %x2, double %9, i64 0 + %11 = fneg <2 x double> %x2 + %12 = extractelement <2 x double> %x0, i64 0 + %13 = extractelement <2 x double> %x1, i64 0 + %14 = extractelement <2 x double> %11, i64 0 + %15 = call double @llvm.x86.avx512.vfmadd.f64(double %12, double %13, double %14, i32 11) + %16 = extractelement <2 x double> %x2, i64 0 + %17 = insertelement <2 x double> %x2, double %15, i64 0 + %18 = fneg <2 x double> %x2 + %19 = extractelement <2 x double> %x0, i64 0 + %20 = extractelement <2 x double> %x1, i64 0 + %21 = extractelement <2 x double> %18, i64 0 + %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 10) + %23 = extractelement <2 x double> %x2, i64 0 + %24 = bitcast i8 %x3 to <8 x i1> + %25 = extractelement <8 x i1> %24, i64 0 + %26 = select i1 %25, double %22, double %23 + %27 = insertelement <2 x double> %x2, double %26, i64 0 + %res3 = fadd <2 x double> %10, %17 + %res4 = fadd <2 x double> %27, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]] 
+; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] +; CHECK: 26: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 27: +; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] +; CHECK: 35: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 36: +; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]] +; 
CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32 +; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0 +; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]] +; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = fneg <4 x float> %x2 + %2 = extractelement <4 x float> %x0, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = extractelement <4 x float> %1, i64 0 + %5 = call float @llvm.fma.f32(float %2, float %3, float %4) + %6 = extractelement <4 x float> %x2, i64 0 + %7 = bitcast i8 %x3 to <8 x i1> + %8 = extractelement <8 x i1> %7, i64 0 + %9 = select i1 %8, float %5, float %6 + %10 = insertelement <4 x float> %x2, float %9, i64 0 + %11 = fneg <4 x float> %x2 + %12 = extractelement <4 x float> %x0, i64 0 + %13 = extractelement <4 x float> %x1, i64 0 + %14 = extractelement <4 x float> %11, i64 0 + %15 = call float @llvm.x86.avx512.vfmadd.f32(float %12, float %13, float %14, i32 11) + %16 = extractelement <4 x float> %x2, i64 0 + %17 = insertelement <4 x float> %x2, float %15, i64 0 + %18 = fneg <4 x float> %x2 + %19 = extractelement <4 x float> %x0, i64 0 + %20 = extractelement <4 x float> %x1, i64 0 + %21 = extractelement <4 x float> %18, i64 0 + %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 10) + %23 = extractelement <4 x float> %x2, i64 0 + %24 = bitcast i8 %x3 to <8 x i1> + %25 = extractelement <8 x i1> %24, i64 0 + %26 = select i1 %25, float %22, float %23 + %27 = insertelement <4 x float> %x2, float %26, i64 0 + %res3 = fadd <4 x float> %10, %17 + %res4 = fadd <4 x float> %27, %res3 + ret <4 x float> %res4 +} + +define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = 
extractelement <2 x double> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg <2 x double> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x 
double> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64 +; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] +; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x double> [[RES4]] +; + %1 = fneg <2 x double> %x0 + %2 = fneg <2 x double> %x2 + %3 = extractelement <2 x double> %1, i64 0 + %4 = extractelement <2 x double> %x1, i64 0 + %5 = extractelement <2 x double> %2, i64 0 + %6 = call double @llvm.fma.f64(double %3, double %4, double %5) + %7 = extractelement <2 x double> %x2, i64 0 + %8 = bitcast i8 %x3 to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, double %6, double %7 + %11 = insertelement <2 x double> %x2, double %10, i64 0 + %12 = fneg <2 x double> %x0 + %13 = fneg <2 x double> %x2 + %14 = extractelement <2 x double> %12, i64 0 + %15 = extractelement <2 x double> %x1, i64 0 + %16 = extractelement <2 x double> %13, i64 0 + %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11) + %18 = extractelement <2 x double> %x2, i64 0 + %19 = insertelement <2 x double> %x2, double %17, i64 0 + %20 = fneg <2 x double> %x0 + %21 = fneg <2 x double> %x2 + %22 = extractelement <2 x double> %20, i64 0 + %23 = extractelement <2 x double> %x1, i64 0 + %24 = extractelement <2 x double> %21, i64 0 + %25 = call double 
@llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 10) + %26 = extractelement <2 x double> %x2, i64 0 + %27 = bitcast i8 %x3 to <8 x i1> + %28 = extractelement <8 x i1> %27, i64 0 + %29 = select i1 %28, double %25, double %26 + %30 = insertelement <2 x double> %x2, double %29, i64 0 + %res3 = fadd <2 x double> %11, %19 + %res4 = fadd <2 x double> %30, %res3 + ret <2 x double> %res4 +} + +define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_ss( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]] +; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] +; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) +; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] +; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]] +; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]] +; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0 +; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP10:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 +; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] +; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 +; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] +; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] +; CHECK: 28: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 29: +; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11) +; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0 +; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]] +; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]] +; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0 +; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0 +; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 +; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 +; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 +; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] +; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 +; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] +; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] +; CHECK: 38: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 39: +; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10) +; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 +; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]] +; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32 +; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32 +; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]] +; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0 +; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]] +; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]] +; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]] +; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0 +; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0 +; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] +; 
CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]] +; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] +; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[RES4]] +; + %1 = fneg <4 x float> %x0 + %2 = fneg <4 x float> %x2 + %3 = extractelement <4 x float> %1, i64 0 + %4 = extractelement <4 x float> %x1, i64 0 + %5 = extractelement <4 x float> %2, i64 0 + %6 = call float @llvm.fma.f32(float %3, float %4, float %5) + %7 = extractelement <4 x float> %x2, i64 0 + %8 = bitcast i8 %x3 to <8 x i1> + %9 = extractelement <8 x i1> %8, i64 0 + %10 = select i1 %9, float %6, float %7 + %11 = insertelement <4 x float> %x2, float %10, i64 0 + %12 = fneg <4 x float> %x0 + %13 = fneg <4 x float> %x2 + %14 = extractelement <4 x float> %12, i64 0 + %15 = extractelement <4 x float> %x1, i64 0 + %16 = extractelement <4 x float> %13, i64 0 + %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11) + %18 = extractelement <4 x float> %x2, i64 0 + %19 = insertelement <4 x float> %x2, float %17, i64 0 + %20 = fneg <4 x float> %x0 + %21 = fneg <4 x float> %x2 + %22 = extractelement <4 x float> %20, i64 0 + %23 = extractelement <4 x float> %x1, i64 0 + %24 = extractelement <4 x float> %21, i64 0 + %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 10) + %26 = extractelement <4 x float> %x2, i64 0 + %27 = bitcast i8 %x3 to <8 x i1> + %28 = extractelement <8 x i1> %27, i64 0 + %29 = select i1 %28, float %25, float %26 + %30 = insertelement <4 x float> %x2, float %29, i64 0 + %res3 = fadd <4 x float> %11, %19 + %res4 = fadd <4 x float> %30, %res3 + ret <4 x float> %res4 +} + +define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> 
[[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %vecinit.i, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %3 + %8 = insertelement <4 x float> %x1, float %7, i64 0 + ret <4 x float> %8 +} + +define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 6: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 +; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32 +; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]] +; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP24]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %vecinit.i, i64 0 + %3 = extractelement <4 x float> %x1, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = bitcast i8 %x3 to <8 x i1> + %6 = extractelement <8 x i1> %5, i64 0 + %7 = select i1 %6, float %4, float %1 + %8 = insertelement <4 x float> %x0, float %7, i64 0 + ret <4 x float> %8 +} + + +define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { +; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 5: +; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 +; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], 
i32 0 +; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 +; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 +; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 +; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] +; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] +; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32 +; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]] +; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0 +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00 +; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 +; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP19]] +; + %q = load float, ptr %ptr_b + %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %1 = extractelement <4 x float> %x0, i64 0 + %2 = extractelement <4 x float> %x1, i64 0 + %3 = extractelement <4 x float> %vecinit.i, i64 0 + %4 = call float @llvm.fma.f32(float %1, float %2, float %3) + %5 = select i1 false, float %4, float 0.000000e+00 + %6 = insertelement <4 x float> %x0, float %5, i64 0 + ret <4 x float> %6 +} + +define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psll_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psll_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] +; 
CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psll_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psll_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> 
[[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psll_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], 
ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_pslli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_pslli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_pslli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> 
[[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psra_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x 
i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psra_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psra_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 
8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psra_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psra_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = 
load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + + +define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrai_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> 
[[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrai_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> 
@llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrai_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 
%mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone + + + +define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrl_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x 
i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrl_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr 
@__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrl_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[RES:%.*]] = call <8 
x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone + + +define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + ret <16 x i32> %res +} +define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x 
i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru + ret <16 x i32> %res2 +} +define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} +declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone + + +define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) #0 { +; CHECK-LABEL: @test_x86_avx512_psrli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + ret <8 x i64> %res +} +define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer +; CHECK-NEXT: 
[[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru + ret <8 x i64> %res2 +} +define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] +; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} +declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone + +define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> 
[[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_psllv_d_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_d_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) + %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) + %res2 = add <16 x i32> %res0, %res1 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psllv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> 
%a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_psllv_q_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psllv_q_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> 
@llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) + %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) + %res2 = add <8 x i64> %res0, %res1 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psllv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrav_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call 
<16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrav_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load 
<8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrav_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = 
icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone + +define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + ret <16 x i32> %res +} + +define <16 x i32> @test_x86_avx512_psrlv_d_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_d_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res0 
= call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) + %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) + %res2 = add <16 x i32> %res0, %res1 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 + ret <16 x i32> %res2 +} + +define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> 
[[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[RES2]] +; + %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) + %mask.cast = bitcast i16 %mask to <16 x i1> + %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer + ret <16 x i32> %res2 +} + +declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone + +define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) + ret <8 x i64> %res +} + +define <8 x i64> @test_x86_avx512_psrlv_q_512_const() #0 { +; CHECK-LABEL: @test_x86_avx512_psrlv_q_512_const( +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) +; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) + %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) + %res2 = add <8 x i64> %res0, %res1 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; 
CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> +; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 + ret <8 x i64> %res2 +} + +define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { +; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q_512( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] +; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer +; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES2]] +; + %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> 
%a1) + %mask.cast = bitcast i8 %mask to <8 x i1> + %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer + ret <8 x i64> %res2 +} + +declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone + + +define <8 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castpd128_pd256_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <2 x double> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %a1 = freeze <2 x double> poison + %res = shufflevector <2 x double> %a0, <2 x double> %a1, <8 x i32> + ret <8 x double> %res +} + + +define <8 x double> @test_mm256_castpd256_pd256_freeze(<4 x double> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castpd256_pd256_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <4 x double> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x double> [[RES]] +; + %a1 = freeze <4 x double> poison + %res = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> + ret <8 x double> %res +} + + +define <16 x float> @test_mm256_castps128_ps512_freeze(<4 x float> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castps128_ps512_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <4 x float> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %a1 = freeze <4 x float> poison + %res = shufflevector <4 x float> %a0, <4 x float> %a1, <16 x i32> + ret <16 x float> %res +} + + +define <16 x float> @test_mm256_castps256_ps512_freeze(<8 x float> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm256_castps256_ps512_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <8 x float> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> [[A1]], <16 x i32> +; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[RES]] +; + %a1 = freeze <8 x float> poison + %res = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> + ret <16 x float> %res +} + + +define <8 x i64> @test_mm512_castsi128_si512_freeze(<2 x i64> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm512_castsi128_si512_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT:
call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <2 x i64> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %a1 = freeze <2 x i64> poison + %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <8 x i32> + ret <8 x i64> %res +} + + +define <8 x i64> @test_mm512_castsi256_si512_pd256_freeze(<4 x i64> %a0) nounwind #0 { +; CHECK-LABEL: @test_mm512_castsi256_si512_pd256_freeze( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[A1:%.*]] = freeze <4 x i64> poison +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> [[A1]], <8 x i32> +; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[RES]] +; + %a1 = freeze <4 x i64> poison + %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> + ret <8 x i64> %res +} + + +define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { +; CHECK-LABEL: @bad_mask_transition( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] +; CHECK: 8: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 9: +; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP10]] to i8 +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP12]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 +; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP13]], 0 +; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] +; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] +; CHECK: 14: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; 
CHECK: 15: +; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[C:%.*]], <8 x double> [[D:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i16 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP17]] to i16 +; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[CONV]] to <16 x i1> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[CONV2]] to <16 x i1> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP18]], <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i1> [[TMP19]], <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> +; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> [[TMP4]], <16 x i32> [[TMP5]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP24]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], [[TMP4]] +; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP27]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP28]], <16 x i32> [[TMP23]] +; CHECK-NEXT: [[TMP29:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[F]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP29]] +; +entry: + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) + %3 = bitcast <8 x i1> %2 to i8 + %conv = zext i8 %1 to i16 + %conv2 = zext i8 %3 to i16 + %4 = bitcast i16 %conv to <16 x i1> + %5 = bitcast i16 %conv2 to <16 x i1> + %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e + ret <16 x float> %9 +} + +define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { +; CHECK-LABEL: @bad_mask_transition_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] +; CHECK-NEXT: unreachable +; CHECK: 7: +; CHECK-NEXT:
[[TMP8:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP8]] to i8 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP9]] to i16 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[CONV]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP16]], <16 x i32> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[F]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) + %1 = bitcast <8 x i1> %0 to i8 + %conv = zext i8 %1 to i16 + %2 = bitcast i16 %conv to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e + ret <16 x float> %3 +} + +declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>) +declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>) +declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) +declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) +declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>) +declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>) +declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) +declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) + +attributes #0 = { sanitize_memory } From 84af3ee5124de3385b829c3a9980fd734f0d92e8 Mon Sep 17 00:00:00 2001 From: Fangrui Song <i@maskray.me> Date: Sun, 26 Jan 2025 16:13:51 -0800 Subject: [PATCH 153/432] [ELF] Replace Fatal with Err --- lld/ELF/Arch/ARM.cpp | 4 ++-- lld/ELF/Arch/RISCV.cpp | 2 +- lld/ELF/InputFiles.cpp | 34 ++++++++++++++++++++++------------ 3 files changed, 25 insertions(+), 15 deletions(-) diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp index de6e45c6cc65c..7d2953ddf64f0 100644 --- a/lld/ELF/Arch/ARM.cpp +++ b/lld/ELF/Arch/ARM.cpp @@ -1536,8 +1536,8 @@ template <class ELFT> void elf::writeARMCmseImportLib(Ctx &ctx) { } if (auto e = buffer->commit()) - Fatal(ctx) << "failed to write output '" << buffer->getPath() - << "': " << std::move(e); + Err(ctx) << "failed to write output '" << buffer->getPath() + << "': " << std::move(e); } void elf::setARMTargetInfo(Ctx &ctx) { ctx.target.reset(new ARM(ctx)); } diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp index 36ae31be6ed2a..4d8989a21b501 100644 --- a/lld/ELF/Arch/RISCV.cpp +++ b/lld/ELF/Arch/RISCV.cpp @@ -885,7 +885,7 @@ static bool relax(Ctx &ctx, InputSection &sec) { } // Inform assignAddresses that the size has changed.
if (!isUInt<32>(delta)) - Fatal(ctx) << "section size decrease is too large: " << delta; + Err(ctx) << "section size decrease is too large: " << delta; sec.bytesDropped = delta; return changed; } diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index eba4c234d3f16..42d0e4c202ec6 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1131,8 +1131,8 @@ void ObjFile<ELFT>::initializeSymbols(const object::ELFFile<ELFT> &obj) { sym->isUsedInRegularObj = true; if (LLVM_UNLIKELY(eSym.st_shndx == SHN_COMMON)) { if (value == 0 || value >= UINT32_MAX) - Fatal(ctx) << this << ": common symbol '" << sym->getName() - << "' has invalid alignment: " << value; + Err(ctx) << this << ": common symbol '" << sym->getName() + << "' has invalid alignment: " << value; hasCommonSyms = true; sym->resolve(ctx, CommonSymbol{ctx, this, StringRef(), binding, stOther, type, value, size}); @@ -1384,16 +1384,22 @@ std::vector<uint32_t> SharedFile::parseVerneed(const ELFFile<ELFT> &obj, ArrayRef<uint8_t> data = CHECK2(obj.getSectionContents(*sec), this); const uint8_t *verneedBuf = data.begin(); for (unsigned i = 0; i != sec->sh_info; ++i) { - if (verneedBuf + sizeof(typename ELFT::Verneed) > data.end()) - Fatal(ctx) << this << " has an invalid Verneed"; + if (verneedBuf + sizeof(typename ELFT::Verneed) > data.end()) { + Err(ctx) << this << " has an invalid Verneed"; + break; + } auto *vn = reinterpret_cast<const typename ELFT::Verneed *>(verneedBuf); const uint8_t *vernauxBuf = verneedBuf + vn->vn_aux; for (unsigned j = 0; j != vn->vn_cnt; ++j) { - if (vernauxBuf + sizeof(typename ELFT::Vernaux) > data.end()) - Fatal(ctx) << this << " has an invalid Vernaux"; + if (vernauxBuf + sizeof(typename ELFT::Vernaux) > data.end()) { + Err(ctx) << this << " has an invalid Vernaux"; + break; + } auto *aux = reinterpret_cast<const typename ELFT::Vernaux *>(vernauxBuf); - if (aux->vna_name >= this->stringTable.size()) - Fatal(ctx) << this << " has a Vernaux with an invalid vna_name"; + if (aux->vna_name >= this->stringTable.size()) { + Err(ctx) << this << " has a Vernaux with an invalid vna_name"; + break; + } uint16_t version = aux->vna_other & VERSYM_VERSION; if (version >= verneeds.size()) verneeds.resize(version + 1); @@ -1481,13 +1487,17 @@ template <class ELFT> void SharedFile::parse() { for (const Elf_Dyn &dyn : dynamicTags) { if (dyn.d_tag == DT_NEEDED) { uint64_t val = dyn.getVal(); - if (val >= this->stringTable.size()) - Fatal(ctx) << this << ": invalid DT_NEEDED entry"; + if (val >= this->stringTable.size()) { + Err(ctx) << this << ": invalid DT_NEEDED entry"; + return; + } dtNeeded.push_back(this->stringTable.data() + val); } else if (dyn.d_tag == DT_SONAME) { uint64_t val = dyn.getVal(); - if (val >= this->stringTable.size()) - Fatal(ctx) << this << ": invalid DT_SONAME entry"; + if (val >= this->stringTable.size()) { + Err(ctx) << this << ": invalid DT_SONAME entry"; + return; + } soName = this->stringTable.data() + val; } } From a6044a05cd16d2c5dbca80757a160cba9a2cb037 Mon Sep 17 00:00:00 2001 From: Thurston Dang <thurston@google.com> Date: Sun, 26 Jan 2025 16:59:34 -0800 Subject: [PATCH 154/432] [msan] Fix-forward avx512-intrinsics-upgrade.ll (#124495) I had added the test in https://github.com/llvm/llvm-project/pull/123980 and contemporaneously added AVX masked store/load intrinsics (https://github.com/llvm/llvm-project/pull/123857) and forgot to update the test output for the intersection. This patch fixes the output.
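The CHECK lines in this file are machine-generated, so the fix-forward is to regenerate them rather than hand-edit the expectations. A minimal sketch of that step, assuming a local build tree at build/ (that path is an assumption, not part of this patch; the test file already carries the update-script annotations):

  # build/bin/opt is an assumed local build path; point --opt-binary at your own opt.
  llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
      llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll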
--- .../X86/avx512-intrinsics-upgrade.ll | 184 +++++------------- 1 file changed, 44 insertions(+), 140 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index edb618fdfb8fb..1ab13a1f1bfeb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -16542,23 +16542,15 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16576,23 +16568,15 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; 
CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16611,24 +16595,16 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16647,24 +16623,16 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16680,18 +16648,10 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) @@ -16703,18 +16663,10 @@ 
define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) @@ -16729,23 +16681,15 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], 
<16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16763,23 +16707,15 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16798,24 +16734,16 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: 
[[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16834,24 +16762,16 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16867,18 +16787,10 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> 
[[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) @@ -16890,18 +16802,10 @@ define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: +; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) From b9d301cc7e4fe4c442ec15169686fa4a18f5cdfc Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 27 Jan 2025 01:10:35 +0000 Subject: [PATCH 155/432] Revert "[msan] Add handlers for AVX masked load/store intrinsics (#123857)" This reverts commit db79fb2a91df31a07f312f8e061936927ac5c506. 
Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/144/builds/16636/steps/6/logs/FAIL__LLVM__avx512-intrinsics-upgrade_ll)
---
 .../Instrumentation/MemorySanitizer.cpp       | 154 +----------------
 .../MemorySanitizer/X86/avx-intrinsics-x86.ll | 160 ++++++++----------
 .../X86/avx2-intrinsics-x86.ll                | 152 ++++++++---------
 .../i386/avx-intrinsics-i386.ll               | 160 ++++++++----------
 .../i386/avx2-intrinsics-i386.ll              | 152 ++++++++---------
 5 files changed, 289 insertions(+), 489 deletions(-)

diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b6293af4ab477..56d3eb10d73e9 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3046,8 +3046,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     if (maybeHandleSimpleNomemIntrinsic(I))
       return true;

-    // FIXME: detect and handle SSE maskstore/maskload?
-    // Some cases are now handled in handleAVXMasked{Load,Store}.
+    // FIXME: detect and handle SSE maskstore/maskload
     return false;
   }

@@ -3684,10 +3683,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // TODO: Store origin.
   }

-  // Intrinsic::masked_store
-  //
-  // Note: handleAVXMaskedStore handles AVX/AVX2 variants, though AVX512 masked
-  // stores are lowered to Intrinsic::masked_store.
   void handleMaskedStore(IntrinsicInst &I) {
     IRBuilder<> IRB(&I);
     Value *V = I.getArgOperand(0);
@@ -3718,10 +3713,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
                      std::max(Alignment, kMinOriginAlignment));
   }

-  // Intrinsic::masked_load
-  //
-  // Note: handleAVXMaskedLoad handles AVX/AVX2 variants, though AVX512 masked
-  // loads are lowered to Intrinsic::masked_load.
   void handleMaskedLoad(IntrinsicInst &I) {
     IRBuilder<> IRB(&I);
     Value *Ptr = I.getArgOperand(0);
@@ -3763,125 +3754,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, Origin);
   }

-  // e.g., void @llvm.x86.avx.maskstore.ps.256(ptr, <8 x i32>, <8 x float>)
-  //                                           dst  mask       src
-  //
-  // AVX512 masked stores are lowered to Intrinsic::masked_load and are handled
-  // by handleMaskedStore.
-  //
-  // This function handles AVX and AVX2 masked stores; these use the MSBs of a
-  // vector of integers, unlike the LLVM masked intrinsics, which require a
-  // vector of booleans. X86InstCombineIntrinsic.cpp::simplifyX86MaskedLoad
-  // mentions that the x86 backend does not know how to efficiently convert
-  // from a vector of booleans back into the AVX mask format; therefore, they
-  // (and we) do not reduce AVX/AVX2 masked intrinsics into LLVM masked
-  // intrinsics.
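// (A minimal, hedged sketch for orientation; it is not part of this patch,
// and the helper name below is hypothetical. With handleAVXMaskedStore and
// handleAVXMaskedLoad removed, these AVX/AVX2 intrinsics fall back to MSan's
// generic strict treatment, which is what the regenerated CHECK lines in the
// tests below encode: every operand shadow is checked up front and the result
// is treated as clean. Using helpers from this visitor, the fallback behaves
// roughly like:
//
//   void handleIntrinsicStrictly(IntrinsicInst &I) {
//     // Each operand's shadow is reduced to an i1 ([[_MSCMP]] in the tests),
//     // the i1s are OR-ed together ([[_MSOR]]), and any set bit branches to
//     // __msan_warning_noreturn().
//     for (Value *Op : I.args())
//       insertShadowCheck(Op, &I);
//     // The result, if any, is then treated as fully initialized, hence the
//     // "store <N x iK> zeroinitializer, ptr @__msan_retval_tls" lines.
//     if (!I.getType()->isVoidTy())
//       setShadow(&I, getCleanShadow(&I));
//   }
//
// The reverted handlers instead propagated shadow through the intrinsic
// itself, trading that strictness for per-lane precision on masked lanes.)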
-  void handleAVXMaskedStore(IntrinsicInst &I) {
-    IRBuilder<> IRB(&I);
-
-    Value *Dst = I.getArgOperand(0);
-    assert(Dst->getType()->isPointerTy() && "Destination is not a pointer!");
-
-    Value *Mask = I.getArgOperand(1);
-    assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");
-
-    Value *Src = I.getArgOperand(2);
-    assert(isa<VectorType>(Src->getType()) && "Source is not a vector!");
-
-    const Align Alignment = Align(1);
-
-    Value *SrcShadow = getShadow(Src);
-
-    if (ClCheckAccessAddress) {
-      insertShadowCheck(Dst, &I);
-      insertShadowCheck(Mask, &I);
-    }
-
-    Value *DstShadowPtr;
-    Value *DstOriginPtr;
-    std::tie(DstShadowPtr, DstOriginPtr) = getShadowOriginPtr(
-        Dst, IRB, SrcShadow->getType(), Alignment, /*isStore*/ true);
-
-    SmallVector<Value *> ShadowArgs;
-    ShadowArgs.append(1, DstShadowPtr);
-    ShadowArgs.append(1, Mask);
-    // The intrinsic may require floating-point but shadows can be arbitrary
-    // bit patterns, of which some would be interpreted as "invalid"
-    // floating-point values (NaN etc.); we assume the intrinsic will happily
-    // copy them.
-    ShadowArgs.append(1, IRB.CreateBitCast(SrcShadow, Src->getType()));
-
-    CallInst *CI =
-        IRB.CreateIntrinsic(IRB.getVoidTy(), I.getIntrinsicID(), ShadowArgs);
-    setShadow(&I, CI);
-
-    if (!MS.TrackOrigins)
-      return;
-
-    // Approximation only
-    auto &DL = F.getDataLayout();
-    paintOrigin(IRB, getOrigin(Src), DstOriginPtr,
-                DL.getTypeStoreSize(SrcShadow->getType()),
-                std::max(Alignment, kMinOriginAlignment));
-  }
-
-  // e.g., <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>)
-  //       return                                    src  mask
-  //
-  // Masked-off values are replaced with 0, which conveniently also represents
-  // initialized memory.
-  //
-  // AVX512 masked stores are lowered to Intrinsic::masked_load and are handled
-  // by handleMaskedStore.
-  //
-  // We do not combine this with handleMaskedLoad; see comment in
-  // handleAVXMaskedStore for the rationale.
-  //
-  // This is subtly different than handleIntrinsicByApplyingToShadow(I, 1)
-  // because we need to apply getShadowOriginPtr, not getShadow, to the first
-  // parameter.
-  void handleAVXMaskedLoad(IntrinsicInst &I) {
-    IRBuilder<> IRB(&I);
-
-    Value *Src = I.getArgOperand(0);
-    assert(Src->getType()->isPointerTy() && "Source is not a pointer!");
-
-    Value *Mask = I.getArgOperand(1);
-    assert(isa<VectorType>(Mask->getType()) && "Mask is not a vector!");
-
-    const Align Alignment = Align(1);
-
-    if (ClCheckAccessAddress) {
-      insertShadowCheck(Mask, &I);
-    }
-
-    Type *SrcShadowTy = getShadowTy(Src);
-    Value *SrcShadowPtr, *SrcOriginPtr;
-    std::tie(SrcShadowPtr, SrcOriginPtr) =
-        getShadowOriginPtr(Src, IRB, SrcShadowTy, Alignment, /*isStore*/ false);
-
-    SmallVector<Value *> ShadowArgs;
-    ShadowArgs.append(1, SrcShadowPtr);
-    ShadowArgs.append(1, Mask);
-
-    CallInst *CI =
-        IRB.CreateIntrinsic(I.getType(), I.getIntrinsicID(), ShadowArgs);
-    // The intrinsic may require floating-point but shadows can be arbitrary
-    // bit patterns, of which some would be interpreted as "invalid"
-    // floating-point values (NaN etc.); we assume the intrinsic will happily
-    // copy them.
-    setShadow(&I, IRB.CreateBitCast(CI, getShadowTy(&I)));
-
-    if (!MS.TrackOrigins)
-      return;
-
-    // The "pass-through" value is always zero (initialized). To the extent
-    // that that results in initialized aligned 4-byte chunks, the origin value
-    // is ignored. It is therefore correct to simply copy the origin from src.
-    Value *PtrSrcOrigin = IRB.CreateLoad(MS.OriginTy, SrcOriginPtr);
-    setOrigin(&I, PtrSrcOrigin);
-  }
-
   // Instrument BMI / BMI2 intrinsics.
   // All of these intrinsics are Z = I(X, Y)
   // where the types of all operands and the result match, and are either i32 or
@@ -4594,30 +4466,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       break;
     }

-    case Intrinsic::x86_avx_maskstore_ps:
-    case Intrinsic::x86_avx_maskstore_pd:
-    case Intrinsic::x86_avx_maskstore_ps_256:
-    case Intrinsic::x86_avx_maskstore_pd_256:
-    case Intrinsic::x86_avx2_maskstore_d:
-    case Intrinsic::x86_avx2_maskstore_q:
-    case Intrinsic::x86_avx2_maskstore_d_256:
-    case Intrinsic::x86_avx2_maskstore_q_256: {
-      handleAVXMaskedStore(I);
-      break;
-    }
-
-    case Intrinsic::x86_avx_maskload_ps:
-    case Intrinsic::x86_avx_maskload_pd:
-    case Intrinsic::x86_avx_maskload_ps_256:
-    case Intrinsic::x86_avx_maskload_pd_256:
-    case Intrinsic::x86_avx2_maskload_d:
-    case Intrinsic::x86_avx2_maskload_q:
-    case Intrinsic::x86_avx2_maskload_d_256:
-    case Intrinsic::x86_avx2_maskload_q_256: {
-      handleAVXMaskedLoad(I);
-      break;
-    }
-
     case Intrinsic::fshl:
     case Intrinsic::fshr:
       handleFunnelShift(I);
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
index 43f51a810d0d2..7273e431a9c2a 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx-intrinsics-x86.ll
@@ -532,22 +532,20 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly
 define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 {
 ; CHECK-LABEL: @test_x86_avx_maskload_pd(
+; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP4]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x double> [[TMP5]] to <4 x i64> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) -; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -584,22 +580,20 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP4]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <4 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -610,22 +604,20 @@ declare <4 
x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP10]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP4]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x float> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) -; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -636,25 +628,23 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP6]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], 
label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -665,25 +655,23 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP6]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -694,25 +682,23 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP6]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -723,25 +709,23 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP6]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP7]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 
[[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll index c68461dd367ee..e10062142c046 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll @@ -995,21 +995,20 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP4]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1020,21 +1019,20 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP4]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> 
[[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> [[A1]]) -; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1045,21 +1043,20 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP4]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1070,21 +1067,20 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP9]] to ptr -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[TMP4]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: 
[[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] +; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> [[A1]]) -; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK: 5: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1095,24 +1091,23 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP6]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1123,24 +1118,23 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x 
i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP6]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1151,24 +1145,23 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP6]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label 
[[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1179,24 +1172,23 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP6]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) +; CHECK: 7: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll index a22ca6dd15da4..68337d6d962db 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx-intrinsics-i386.ll @@ -550,23 +550,21 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(ptr) nounwind readonly define <2 x double> @test_x86_avx_maskload_pd(ptr %a0, <2 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[TMP11]], <2 x i64> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x double> [[TMP6]] to <2 x i64> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0]], <2 x i64> [[MASK]]) -; CHECK-NEXT: store <2 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx.maskload.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x double> [[RES]] ; %res = call <2 x double> @llvm.x86.avx.maskload.pd(ptr %a0, <2 x i64> %mask) ; <<2 x double>> [#uses=1] @@ -577,23 +575,21 @@ declare <2 x double> @llvm.x86.avx.maskload.pd(ptr, <2 x i64>) nounwind readonly define <4 x double> @test_x86_avx_maskload_pd_256(ptr %a0, <4 x i64> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_pd_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[TMP11]], <4 x i64> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x double> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0]], <4 x i64> [[MASK]]) -; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; 
CHECK-NEXT: ret <4 x double> [[RES]] ; %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(ptr %a0, <4 x i64> %mask) ; <<4 x double>> [#uses=1] @@ -604,23 +600,21 @@ declare <4 x double> @llvm.x86.avx.maskload.pd.256(ptr, <4 x i64>) nounwind read define <4 x float> @test_x86_avx_maskload_ps(ptr %a0, <4 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[TMP11]], <4 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <4 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0]], <4 x i32> [[MASK]]) -; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx.maskload.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x float> [[RES]] ; %res = call <4 x float> @llvm.x86.avx.maskload.ps(ptr %a0, <4 x i32> %mask) ; <<4 x float>> [#uses=1] @@ -631,23 +625,21 @@ declare <4 x float> @llvm.x86.avx.maskload.ps(ptr, <4 x i32>) nounwind readonly define <8 x float> @test_x86_avx_maskload_ps_256(ptr %a0, <8 x i32> %mask) #0 { ; CHECK-LABEL: @test_x86_avx_maskload_ps_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP11:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[TMP11]], <8 x i32> [[MASK:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x float> [[TMP6]] to <8 x i32> +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; 
CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0]], <8 x i32> [[MASK]]) -; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x float> [[RES]] ; %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(ptr %a0, <8 x i32> %mask) ; <<8 x float>> [#uses=1] @@ -658,26 +650,24 @@ declare <8 x float> @llvm.x86.avx.maskload.ps.256(ptr, <8 x i32>) nounwind reado define void @test_x86_avx_maskstore_pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP3]] to <2 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[TMP7]], <2 x i64> [[MASK:%.*]], <2 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0]], <2 x i64> [[MASK]], <2 x double> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd(ptr [[A0:%.*]], <2 x i64> [[MASK:%.*]], <2 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd(ptr %a0, <2 x i64> %mask, <2 x double> %a2) @@ -688,26 +678,24 @@ declare void @llvm.x86.avx.maskstore.pd(ptr, <2 x i64>, <2 x double>) nounwind define void @test_x86_avx_maskstore_pd_256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_pd_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to 
ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[TMP3]] to <4 x double> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[TMP7]], <4 x i64> [[MASK:%.*]], <4 x double> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0]], <4 x i64> [[MASK]], <4 x double> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.pd.256(ptr [[A0:%.*]], <4 x i64> [[MASK:%.*]], <4 x double> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.pd.256(ptr %a0, <4 x i64> %mask, <4 x double> %a2) @@ -718,26 +706,24 @@ declare void @llvm.x86.avx.maskstore.pd.256(ptr, <4 x i64>, <4 x double>) nounwi define void @test_x86_avx_maskstore_ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP3]] to <4 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[TMP7]], <4 x i32> [[MASK:%.*]], <4 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0]], <4 x i32> [[MASK]], <4 x float> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps(ptr [[A0:%.*]], <4 x i32> [[MASK:%.*]], <4 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps(ptr %a0, <4 x i32> %mask, <4 x float> %a2) @@ -748,26 +734,24 @@ declare void @llvm.x86.avx.maskstore.ps(ptr, <4 x i32>, <4 x float>) nounwind define void @test_x86_avx_maskstore_ps_256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) #0 { ; CHECK-LABEL: @test_x86_avx_maskstore_ps_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = and i64 [[TMP9]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP3]] to <8 x float> -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[TMP7]], <8 x i32> [[MASK:%.*]], <8 x float> [[TMP8]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() ; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0]], <8 x i32> [[MASK]], <8 x float> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx.maskstore.ps.256(ptr [[A0:%.*]], <8 x i32> [[MASK:%.*]], <8 x float> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx.maskstore.ps.256(ptr %a0, <8 x i32> %mask, <8 x float> %a2) diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll index 442f0c422645a..29e2931d2ca48 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll @@ -1048,22 +1048,21 @@ declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x 
i32>) nounwind reado define <2 x i64> @test_x86_avx2_maskload_q(ptr %a0, <2 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[TMP10]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <2 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <2 x i64> [[RES]] ; %res = call <2 x i64> @llvm.x86.avx2.maskload.q(ptr %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] @@ -1074,22 +1073,21 @@ declare <2 x i64> @llvm.x86.avx2.maskload.q(ptr, <2 x i64>) nounwind readonly define <4 x i64> @test_x86_avx2_maskload_q_256(ptr %a0, <4 x i64> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_q_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[TMP10]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0]], <4 x i64> [[A1]]) -; CHECK-NEXT: store <4 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i64> zeroinitializer, 
ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i64> [[RES]] ; %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1] @@ -1100,22 +1098,21 @@ declare <4 x i64> @llvm.x86.avx2.maskload.q.256(ptr, <4 x i64>) nounwind readonl define <4 x i32> @test_x86_avx2_maskload_d(ptr %a0, <4 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[TMP10]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; %res = call <4 x i32> @llvm.x86.avx2.maskload.d(ptr %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1] @@ -1126,22 +1123,21 @@ declare <4 x i32> @llvm.x86.avx2.maskload.d(ptr, <4 x i32>) nounwind readonly define <8 x i32> @test_x86_avx2_maskload_d_256(ptr %a0, <8 x i32> %a1) #0 { ; CHECK-LABEL: @test_x86_avx2_maskload_d_256( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP7]], -2147483649 -; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP4]] to ptr -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[TMP10]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 ; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0]], <8 x i32> 
[[A1]]) -; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK: 6: +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1] @@ -1152,25 +1148,24 @@ declare <8 x i32> @llvm.x86.avx2.maskload.d.256(ptr, <8 x i32>) nounwind readonl define void @test_x86_avx2_maskstore_q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q( -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[TMP7]], <2 x i64> [[A1:%.*]], <2 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0]], <2 x i64> [[A1]], <2 x i64> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q(ptr [[A0:%.*]], <2 x i64> [[A1:%.*]], <2 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q(ptr %a0, <2 x i64> %a1, <2 x i64> %a2) @@ -1181,25 +1176,24 @@ declare void @llvm.x86.avx2.maskstore.q(ptr, <2 x i64>, <2 x i64>) nounwind define void @test_x86_avx2_maskstore_q_256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_q_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[TMP7]], <4 x i64> [[A1:%.*]], <4 x i64> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i64> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0]], <4 x i64> [[A1]], <4 x i64> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.q.256(ptr [[A0:%.*]], <4 x i64> [[A1:%.*]], <4 x i64> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.q.256(ptr %a0, <4 x i64> %a1, <4 x i64> %a2) @@ -1210,25 +1204,24 @@ declare void @llvm.x86.avx2.maskstore.q.256(ptr, <4 x i64>, <4 x i64>) nounwind define void @test_x86_avx2_maskstore_d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d( -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[TMP7]], <4 x i32> [[A1:%.*]], <4 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void 
@llvm.x86.avx2.maskstore.d(ptr [[A0]], <4 x i32> [[A1]], <4 x i32> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d(ptr [[A0:%.*]], <4 x i32> [[A1:%.*]], <4 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d(ptr %a0, <4 x i32> %a1, <4 x i32> %a2) @@ -1239,25 +1232,24 @@ declare void @llvm.x86.avx2.maskstore.d(ptr, <4 x i32>, <4 x i32>) nounwind define void @test_x86_avx2_maskstore_d_256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) #0 { ; CHECK-LABEL: @test_x86_avx2_maskstore_d_256( -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 ; CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[A0:%.*]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP8]], -2147483649 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP11]] to ptr -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[TMP7]], <8 x i32> [[A1:%.*]], <8 x i32> [[TMP3]]) ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP3]] to i256 ; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0]], <8 x i32> [[A1]], <8 x i32> [[A2:%.*]]) +; CHECK: 8: +; CHECK-NEXT: call void @llvm.x86.avx2.maskstore.d.256(ptr [[A0:%.*]], <8 x i32> [[A1:%.*]], <8 x i32> [[A2:%.*]]) ; CHECK-NEXT: ret void ; call void @llvm.x86.avx2.maskstore.d.256(ptr %a0, <8 x i32> %a1, <8 x i32> %a2) From b6eeec586fa6c0db4ab1b0e129111e82a97c7283 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Mon, 27 Jan 2025 01:10:51 +0000 Subject: [PATCH 156/432] Revert "[msan] Fix-forward avx512-intrinsics-upgrade.ll (#124495)" This reverts commit a6044a05cd16d2c5dbca80757a160cba9a2cb037. 
Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/144/builds/16636/steps/6/logs/FAIL__LLVM__avx512-intrinsics-upgrade_ll) --- .../X86/avx512-intrinsics-upgrade.ll | 184 +++++++++++++----- 1 file changed, 140 insertions(+), 44 deletions(-) diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll index 1ab13a1f1bfeb..edb618fdfb8fb 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll @@ -16542,15 +16542,23 @@ define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16568,15 +16576,23 @@ define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = 
icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16595,16 +16611,24 @@ define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16623,16 
+16647,24 @@ define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] @@ -16648,10 +16680,18 @@ define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> 
@llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) @@ -16663,10 +16703,18 @@ define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x flo ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] +; CHECK: 5: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 6: ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x float> [[TMP7]] ; %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) @@ -16681,15 +16729,23 @@ define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x f ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: 
[[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16707,15 +16763,23 @@ define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] +; CHECK: 6: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 7: ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) ; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer @@ -16734,16 +16798,24 @@ define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x fl ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] +; CHECK: 7: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] +; CHECK-NEXT: unreachable +; CHECK: 8: ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) 
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
 ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
 ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
@@ -16762,16 +16834,24 @@ define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16
 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
 ; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
+; CHECK: 7:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 8:
 ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
 ; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
 ; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP1]]
+; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
 ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
 ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
 ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
@@ -16787,10 +16867,18 @@ define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float>
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
 ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x float> [[TMP7]]
 ;
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
@@ -16802,10 +16890,18 @@ define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x flo
 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
 ; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
+; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
+; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
+; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
+; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
+; CHECK: 5:
+; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
+; CHECK-NEXT: unreachable
+; CHECK: 6:
 ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
 ; CHECK-NEXT: ret <16 x float> [[TMP7]]
 ;
  %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)

From b2647ffbf797dd5a457b6b19faab06956934d067 Mon Sep 17 00:00:00 2001
From: Thurston Dang
Date: Mon, 27 Jan 2025 01:35:32 +0000
Subject: [PATCH 157/432] Revert "[msan] Add avx512-intrinsics.ll and
 avx512-intrinsics-upgrade.ll test case (#123980)"

This reverts commit 980e86f130eea02bd41b887f4ed896340fc90f6c.

Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/154/builds/10901/steps/5/logs/FAIL__LLVM__avx512-intrinsics-upgrade_ll)
---
 .../X86/avx512-intrinsics-upgrade.ll          | 19969 ----------------
 .../MemorySanitizer/X86/avx512-intrinsics.ll  | 13714 -----------
 2 files changed, 33683 deletions(-)
 delete mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
 delete mode 100644 llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll

diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
deleted file mode 100644
index edb618fdfb8fb..0000000000000
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics-upgrade.ll
+++ /dev/null
@@ -1,19969 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s
-;
-; Forked from llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
-
-declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
-
-define i16 @unpckbw_test(i16 %a0, i16 %a1) #0 {
-;
-; CHECK-LABEL: @unpckbw_test(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i1> [[TMP3]], <16 x i1> [[TMP3]], <8 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i1> [[TMP4]], <16 x i1> [[TMP4]], <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i1> [[TMP5]], <16 x i1> [[TMP5]], <8 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i1> [[TMP6]], <16 x i1> [[TMP6]], <8 x i32>
-; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[_MSPROP1]], <8 x i1> [[_MSPROP]], <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i1> [[TMP8]], <8 x i1> [[TMP7]], <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP2]] to i16
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP11]]
-;
- %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
- ret i16 %res
-}
-
-define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_gpr_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT:
[[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <16 x i32> [[_MSPROP6]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT1]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP7]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DOTSPLAT2]], [[X1:%.*]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[DOTSPLAT2]], <16 x i32> [[X1]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <16 x i32> poison, i32 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <16 x i32> [[_MSPROP8]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT3]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[DOTSPLAT4]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[DOTSPLAT4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP5]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[DOTSPLAT]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP18]], <16 x i32> [[_MSPROP_SELECT]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP10]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP19]], <16 x i32> [[_MSPROP_SELECT10]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP17]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP20]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) - %res2 = call <16 x i32> 
@llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} -declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) - - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_gpr_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = shufflevector <8 x i64> [[_MSPROP6]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP7]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[DOTSPLAT2]], [[X1:%.*]] -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[DOTSPLAT2]], <8 x i64> [[X1]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[TMP1]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <8 x i64> poison, i64 [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = shufflevector <8 x i64> [[_MSPROP8]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT3]], <8 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[DOTSPLAT4]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT10:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> 
[[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[DOTSPLAT4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP5]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[DOTSPLAT]], 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP18]], <8 x i64> [[_MSPROP_SELECT]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP10]], 1 -; CHECK-NEXT: [[TMP20:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP19]], <8 x i64> [[_MSPROP_SELECT10]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP17]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP20]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} -declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) - - -declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly - -define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_x86_vbroadcast_ss_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> undef, i16 -1) - ret <16 x float> %res -} - -define <16 x float> @test_x86_mask_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_mask_vbroadcast_ss_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> 
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[A1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_x86_maskz_vbroadcast_ss_ps_512(<4 x float> %a0, i16 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_maskz_vbroadcast_ss_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly - -define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1) #0 { -; CHECK-LABEL: @test_x86_vbroadcast_sd_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> undef, i8 -1) - ret <8 x double> %res -} - -define <8 x double> @test_x86_mask_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x 
double> %a1, i8 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_mask_vbroadcast_sd_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[A1:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[A1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_x86_maskz_vbroadcast_sd_pd_512(<2 x double> %a0, i8 %mask ) #0 { -; -; CHECK-LABEL: @test_x86_maskz_vbroadcast_sd_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> 
%x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pbroadcastd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_512(<4 x i32> %x0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; 
CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pbroadcastq_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pbroadcastq_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_512(<2 x i64> %x0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pbroadcastq_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[X0:%.*]], <2 x i64> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = 
bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask) - ret <8 x i64> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_movsldup_512(<16 x float> %x0, <16 x float> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_movsldup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_movsldup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_movsldup_512(<16 x float> %x0, i16 %x2) #0 { -; -; 
CHECK-LABEL: @test_int_x86_avx512_maskz_movsldup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_movshdup_512(<16 x float> %x0, <16 x float> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_movshdup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_movshdup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; 
CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_movshdup_512(<16 x float> %x0, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_movshdup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_movddup_512(<8 x double> %x0, <8 x double> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_movddup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_movddup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_movddup_512(<8 x double> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_movddup_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2) - ret <8 x double> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_perm_df_512(<8 x double> %x0, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_perm_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = 
call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_perm_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_perm_df_512(<8 x double> %x0, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) - ret <8 x double> %res -} - -declare <8 x i64> 
@llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_perm_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_perm_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_perm_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = 
select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) - ret <8 x i64> %res -} - -define void @test_store1(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_store1( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1:![0-9]+]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8:[0-9]+]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr -; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1 -; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 1 -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr, <16 x float> %data, i16 %mask) - call void @llvm.x86.avx512.mask.storeu.ps.512(ptr %ptr2, <16 x float> %data, i16 -1) - ret void -} - -declare void @llvm.x86.avx512.mask.storeu.ps.512(ptr, <16 x float>, i16 ) - -define void @test_store2(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_store2( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 
[[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 1, <8 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080 -; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr -; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1 -; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 1 -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr, <8 x double> %data, i8 %mask) - call void @llvm.x86.avx512.mask.storeu.pd.512(ptr %ptr2, <8 x double> %data, i8 -1) - ret void -} - -declare void @llvm.x86.avx512.mask.storeu.pd.512(ptr, <8 x double>, i8) - -define void @test_mask_store_aligned_ps(<16 x float> %data, ptr %ptr, ptr %ptr2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_store_aligned_ps( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: call void @llvm.masked.store.v16f32.p0(<16 x float> [[DATA:%.*]], ptr [[PTR]], i32 64, <16 x i1> [[TMP6]]) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0 
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <16 x float> [[DATA]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr, <16 x float> %data, i16 %mask)
- call void @llvm.x86.avx512.mask.store.ps.512(ptr %ptr2, <16 x float> %data, i16 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.ps.512(ptr, <16 x float>, i16 )
-
-define void @test_mask_store_aligned_pd(<8 x double> %data, ptr %ptr, ptr %ptr2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_store_aligned_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v8f64.p0(<8 x double> [[DATA:%.*]], ptr [[PTR]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <8 x double> [[DATA]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr, <8 x double> %data, i8 %mask)
- call void @llvm.x86.avx512.mask.store.pd.512(ptr %ptr2, <8 x double> %data, i8 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.pd.512(ptr, <8 x double>, i8)
-
-define void@test_int_x86_avx512_mask_storeu_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 1, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 1, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 1
-; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 1
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2)
- call void @llvm.x86.avx512.mask.storeu.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.q.512(ptr, <8 x i64>, i8)
-
-define void@test_int_x86_avx512_mask_storeu_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_storeu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 1, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 1, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 1
-; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 1
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2)
- call void @llvm.x86.avx512.mask.storeu.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.storeu.d.512(ptr, <16 x i32>, i16)
-
-define void@test_int_x86_avx512_mask_store_q_512(ptr %ptr1, ptr %ptr2, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_store_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[TMP2]], ptr [[TMP9]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[TMP5]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v8i64.p0(<8 x i64> [[X1:%.*]], ptr [[PTR1]], i32 64, <8 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <8 x i64> [[X1]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.q.512(ptr %ptr1, <8 x i64> %x1, i8 %x2)
- call void @llvm.x86.avx512.mask.store.q.512(ptr %ptr2, <8 x i64> %x1, i8 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.q.512(ptr, <8 x i64>, i8)
-
-define void@test_int_x86_avx512_mask_store_d_512(ptr %ptr1, ptr %ptr2, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_store_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR1:%.*]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[TMP2]], ptr [[TMP9]], i32 64, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: call void @llvm.masked.store.v16i32.p0(<16 x i32> [[X1:%.*]], ptr [[PTR1]], i32 64, <16 x i1> [[TMP6]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = xor i64 [[TMP15]], 87960930222080
-; CHECK-NEXT: [[TMP17:%.*]] = inttoptr i64 [[TMP16]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP17]], align 64
-; CHECK-NEXT: store <16 x i32> [[X1]], ptr [[PTR2]], align 64
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr1, <16 x i32> %x1, i16 %x2)
- call void @llvm.x86.avx512.mask.store.d.512(ptr %ptr2, <16 x i32> %x1, i16 -1)
- ret void
-}
-
-declare void @llvm.x86.avx512.mask.store.d.512(ptr, <16 x i32>, i16)
-
-define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_ps(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES4]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(ptr, <16 x float>, i16)
-
-define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, ptr %ptr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_ps(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x float>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 1, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP10]], <16 x float> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 1, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP19]], <16 x float> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES4]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 -1)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> %res, i16 %mask)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr %ptr, <16 x float> zeroinitializer, i16 %mask)
- %res4 = fadd <16 x float> %res2, %res1
- ret <16 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(ptr, <16 x float>, i16)
-
-define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(ptr, <8 x double>, i8)
-
-define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, ptr %ptr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_pd(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x double>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 1, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP10]], <8 x double> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 1, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP19]], <8 x double> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 -1)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> %res, i8 %mask)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr %ptr, <8 x double> zeroinitializer, i8 %mask)
- %res4 = fadd <8 x double> %res2, %res1
- ret <8 x double> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(ptr, <8 x double>, i8)
-
-declare <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr, <16 x i32>, i16)
-
-define <16 x i32> @test_mask_load_unaligned_d(ptr %ptr, ptr %ptr2, <16 x i32> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[TMP6:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
-; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP14]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP10]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
-; CHECK: 16:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 17:
-; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR2]], i32 1, <16 x i1> [[TMP11]], <16 x i32> [[TMP6]])
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
-; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP23]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP19]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP24]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
-; CHECK: 25:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 26:
-; CHECK-NEXT: [[TMP27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 1, <16 x i1> [[TMP20]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP27]], [[TMP18]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES4]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr2, <16 x i32> %res, i16 %mask)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.loadu.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask)
- %res4 = add <16 x i32> %res2, %res1
- ret <16 x i32> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr, <8 x i64>, i8)
-
-define <8 x i64> @test_mask_load_unaligned_q(ptr %ptr, ptr %ptr2, <8 x i64> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_unaligned_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR2:%.*]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080
-; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP14]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP3]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP10]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]]
-; CHECK: 16:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 17:
-; CHECK-NEXT: [[TMP18:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR2]], i32 1, <8 x i1> [[TMP11]], <8 x i64> [[TMP6]])
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080
-; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP23]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP19]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP24]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP25:%.*]], label [[TMP26:%.*]], !prof [[PROF1]]
-; CHECK: 25:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 26:
-; CHECK-NEXT: [[TMP27:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 1, <8 x i1> [[TMP20]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP27]], [[TMP18]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr2, <8 x i64> %res, i8 %mask)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.loadu.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask)
- %res4 = add <8 x i64> %res2, %res1
- ret <8 x i64> %res4
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr, <16 x i32>, i16)
-
-define <16 x i32> @test_mask_load_aligned_d(<16 x i32> %data, ptr %ptr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP13]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i16 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP10]], <16 x i32> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[TMP22]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0(ptr [[PTR]], i32 64, <16 x i1> [[TMP19]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i32> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES4]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 -1)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> %res, i16 %mask)
- %res2 = call <16 x i32> @llvm.x86.avx512.mask.load.d.512(ptr %ptr, <16 x i32> zeroinitializer, i16 %mask)
- %res4 = add <16 x i32> %res2, %res1
- ret <16 x i32> %res4
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr, <8 x i64>, i8)
-
-define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, ptr %ptr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_load_aligned_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP13]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[_MSLD]])
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 16:
-; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP10]], <8 x i64> [[TMP5]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = xor i64 [[TMP20]], 87960930222080
-; CHECK-NEXT: [[TMP22:%.*]] = inttoptr i64 [[TMP21]] to ptr
-; CHECK-NEXT: [[_MSMASKEDLD1:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[TMP22]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP23]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP24:%.*]], label [[TMP25:%.*]], !prof [[PROF1]]
-; CHECK: 24:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 25:
-; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.masked.load.v8i64.p0(ptr [[PTR]], i32 64, <8 x i1> [[TMP19]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[_MSMASKEDLD1]], [[_MSMASKEDLD]]
-; CHECK-NEXT: [[RES4:%.*]] = add <8 x i64> [[TMP26]], [[TMP17]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 -1)
- %res1 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> %res, i8 %mask)
- %res2 = call <8 x i64> @llvm.x86.avx512.mask.load.q.512(ptr %ptr, <8 x i64> zeroinitializer, i8 %mask)
- %res4 = add <8 x i64> %res2, %res1
- ret <8 x i64> %res4
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermil_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP2]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP13]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X0]], <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermil_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP2]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermil_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP13]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_maskz_vpermil_ps_512(<16 x float> %x0, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermil_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP11]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3)
- ret <16 x float> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pshuf_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pshuf_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pshuf_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP3]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3)
- ret <16 x i32> %res
-}
-
-define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_pcmpeq_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: store i16 [[TMP10]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP11]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpeq_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: store i16 [[TMP19]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP20]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_pcmpeq_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: store i8 [[TMP10]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP11]]
-;
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpeq_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: store i8 [[TMP19]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP20]]
-;
- %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
- ret i8 %res
-}
-
-declare i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64>, <8 x i64>, i8)
-
-define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_pcmpgt_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <16 x i32> [[TMP5]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[TMP13]] to i16
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP16]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
- ret i16 %res
-}
-
-define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpgt_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP6]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <16 x i32> [[TMP7]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP21]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i1> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16
-; CHECK-NEXT: store i16 [[TMP24]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP25]]
-;
- %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
- ret i16 %res
-}
-
-declare i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32>, <16 x i32>, i16)
-
-define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_pcmpgt_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt <8 x i64> [[TMP5]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i1> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i1> [[TMP13]] to i8
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i1> [[TMP14]] to i8
-; CHECK-NEXT: store i8 [[TMP15]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP16]]
-;
- %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
- ret i8 %res
-}
-
-define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_pcmpgt_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <8 x i64> [[TMP6]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt <8 x i64> [[TMP7]], [[TMP10]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i1> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP14]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP21]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i1> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8
-; CHECK-NEXT: store i8
[[TMP24]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[TMP25]] -; - %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) - ret i8 %res -} - -declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8) - -declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckh_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckh_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load 
<16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckh_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckl_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> 
[[TMP3]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_unpckl_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_unpckl_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - ret <16 x float> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpcklqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpcklqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: 
[[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_punpcklqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpckhqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 
%x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhqd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpckhd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpckhd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], 
<16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_punpckld_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_punpckld_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_pslli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_pslli_d(<16 x i32> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x 
i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_pslli_q(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_pslli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP13]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_pslli_q(<8 x i64> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrli_d(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], 
<16 x i32> [[TMP6]], <16 x i32> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psrli_d(<16 x i32> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrli_q(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) 
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP13]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_psrli_q(<8 x i64> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrai_d(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrai_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load 
i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[A1:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[A1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psrai_d(<16 x i32> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32>, i32, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrai_q(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; 
CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
-; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrai_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7)
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[A1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP13]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrai_q(<8 x i64> %a0, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7)
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7)
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64>, i32, <8 x i64>, i8) nounwind readnone
-
-declare void @llvm.x86.avx512.storent.q.512(ptr, <8 x i64>)
-
-define void@test_storent_q_512(<8 x i64> %data, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_storent_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64
-; CHECK-NEXT: store <8 x i64> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2:![0-9]+]]
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.storent.q.512(ptr %ptr, <8 x i64> %data)
- ret void
-}
-
-declare void @llvm.x86.avx512.storent.pd.512(ptr, <8 x double>)
-
-define void @test_storent_pd_512(<8 x double> %data, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_storent_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[TMP7]], align 64
-; CHECK-NEXT: store <8 x double> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]]
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.storent.pd.512(ptr %ptr, <8 x double> %data)
- ret void
-}
-
-declare void @llvm.x86.avx512.storent.ps.512(ptr, <16 x float>)
-
-define void @test_storent_ps_512(<16 x float> %data, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_storent_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: store <16 x i32> [[TMP2]], ptr [[TMP7]], align 64
-; CHECK-NEXT: store <16 x float> [[DATA:%.*]], ptr [[PTR]], align 64, !nontemporal [[META2]]
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.storent.ps.512(ptr %ptr, <16 x float> %data)
- ret void
-}
-
-define <16 x i32> @test_xor_epi32(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_xor_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_xor_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_or_epi32(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_or_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_or_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i32> [[A:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[B:%.*]], splat (i32 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP5]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[TMP1]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP19]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_and_epi32(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_and_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[TMP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a,<16 x i32> %b, <16 x i32>zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_and_epi32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i32> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP9]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP9]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_xor_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_xor_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_or_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP10]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_or_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = xor <8 x i64> [[A:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[B:%.*]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP5]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP1]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_and_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[TMP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_and_epi64(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[A:%.*]], [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP1]], [[B:%.*]]
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[A]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP9]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP9]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <16 x i32> @test_mask_add_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mask_add_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_add_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32> @test_mask_sub_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mask_sub_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub <16 x i32> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask)
- ret < 16 x i32> %res
-}
-
-define <16 x i32> @test_mask_sub_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_sub_epi32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = sub <16 x i32> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %q = load i32, ptr %ptr_b
- %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
- %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
- %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask)
- ret < 16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <8 x i64> @test_mask_add_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_mask_add_epi64_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i64> [[A:%.*]], [[B:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
- %b = load <8 x i64>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %b = load <8 x i64>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %b = load <8 x i64>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_add_epi64_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i64> [[A:%.*]], [[B]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT:
[[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_add_epi64_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, 
<8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64> @test_mask_sub_epi64_rr(<8 x i64> %a, <8 x i64> %b) #0 { -; CHECK-LABEL: @test_mask_sub_epi64_rr( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rrk( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rrkz(<8 x i64> %a, <8 x i64> %b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rrkz( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <8 x i64> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; 
CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rm(<8 x i64> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP8]] -; - %b = load <8 x i64>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] 
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %b = load <8 x i64>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %b = load <8 x i64>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 
[[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP8]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = 
xor <8 x i64> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sub_epi64_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <8 x i64> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> 
%res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <16 x i32> @test_mask_mullo_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) #0 { -; CHECK-LABEL: @test_mask_mullo_epi32_rr_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rrk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rrkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x 
i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rm_512(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rm_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP8]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; 
CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[_MSLD]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmb_512(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 
-; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP8]] -; - %q = load i32, ptr %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 -1) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, ptr %ptr_b, <16 x i32> %passThru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmbk_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to 
<16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP2]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %q = load i32, ptr %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) - ret < 16 x i32> %res -} - -define <16 x i32> @test_mask_mullo_epi32_rmbkz_512(<16 x i32> %a, ptr %ptr_b, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mullo_epi32_rmbkz_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i32, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i32> undef, i32 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[_MSPROP]], <16 x i32> splat (i32 -1), <16 x i32> zeroinitializer -; CHECK-NEXT: [[B:%.*]] = shufflevector <16 x i32> [[VECINIT_I]], <16 x i32> undef, <16 x i32> zeroinitializer -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> [[TMP2]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[A:%.*]], [[B]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %q = load i32, ptr %ptr_b - %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 - %b = 
shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer - %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %mask) - ret < 16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - - -declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_shuf_f32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP3]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 
x double> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_shuf_f64x2( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP3]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_f64x2( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_f64x2( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32> -; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP12]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
- ret <8 x double> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_i32x4(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i32x4(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X3:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X3]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_i64x2(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_i64x2(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X3:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X3]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4)
- ret <8 x i64> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP3]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP13]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> [[X3]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP14]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, i8 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_shuf_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP11]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP12]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_shuf_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP3]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_shuf_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X3]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP14]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4)
- ret <16 x float> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxs_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxs_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxs_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umax.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmaxu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmaxu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umax.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmins_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.smin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmins_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmins_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.smin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pminu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pminu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pminu_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.umin.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_mask_move_ss(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
-; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[__W:%.*]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP]], i32 [[_MSPROP1]]
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast float [[TMP17]] to i32
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP18]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP24]], i32 [[TMP19]]
-; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], float [[TMP17]], float [[TMP18]]
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP25]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP26]]
-;
-entry:
- %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> %__W, i8 %__U)
- ret <4 x float> %res
-}
-
-
-define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_maskz_move_ss(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
-; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i32 [[_MSPROP]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP16]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = xor i32 [[TMP18]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i32 [[TMP21]], i32 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], float [[TMP16]], float 0.000000e+00
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x float> [[__A:%.*]], float [[TMP22]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP23]]
-;
-entry:
- %res = call <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float> %__A, <4 x float> %__B, <4 x float> zeroinitializer, i8 %__U)
- ret <4 x float> %res
-}
-
-define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_mask_move_sd(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = xor i8 [[TMP9]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = or i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP11]], 0
-; CHECK-NEXT: [[TMP13:%.*]] = xor i8 [[TMP11]], -1
-; CHECK-NEXT: [[TMP14:%.*]] = and i8 [[TMP13]], [[TMP10]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP12]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[__W:%.*]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP16]], i64 [[_MSPROP]], i64 [[_MSPROP1]]
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast double [[TMP17]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP18]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP24]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP16]], double [[TMP17]], double [[TMP18]]
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP25]], i64 0
-; CHECK-NEXT: store <2 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[TMP26]]
-;
-entry:
- %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> %__W, i8 %__U)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) #0 {
-;
-; CHECK-LABEL: @test_mm_maskz_move_sd(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP0]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[__U:%.*]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[TMP0]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = or i8 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = and i8 [[__U]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = xor i8 [[TMP8]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = or i8 [[TMP7]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP10]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor i8 [[TMP10]], -1
-; CHECK-NEXT: [[TMP13:%.*]] = and i8 [[TMP12]], [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[TMP13]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and i1 [[TMP11]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[__B:%.*]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP15]], i64 [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast double [[TMP16]] to i64
-; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i64 [[TMP20]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP_ICMP]], i64 [[TMP21]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = select i1 [[TMP15]], double [[TMP16]], double 0.000000e+00
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x double> [[__A:%.*]], double [[TMP22]], i64 0
-; CHECK-NEXT: store <2 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[TMP23]]
-;
-entry:
- %res = call <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double> %__A, <2 x double> %__B, <2 x double> zeroinitializer, i8 %__U)
- ret <2 x double> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.move.ss(<4 x float>, <4 x float>, <4 x float>, i8)
-declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, <2 x double>, i8)
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_512(<16 x i8> %x0, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxb_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxb_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_512(<16 x i8> %x0, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxb_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i8> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxd_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxd_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_512(<8 x i32> %x0, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxd_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i32> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i16> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i16> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_512(<16 x i16> %x0, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <16 x i16> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i16> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovzxw_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovzxw_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_512(<8 x i16> %x0, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovzxw_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = zext <8 x i16> [[_MSPROP]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]],
ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_512(<16 x i8> %x0, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i8> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxb_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[TMP2]] to <8 x i64> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxb_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32> -; 
CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i8> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_512(<16 x i8> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxb_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> splat (i8 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[X0:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i8> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) - ret <8 x 
i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_512(<8 x i32> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxd_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> splat (i32 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[X0:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i32> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i32> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16) - -define <16 x 
i32>@test_int_x86_avx512_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i16> [[TMP2]] to <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i16> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_512(<16 x i16> %x0, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> splat (i16 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[X0:%.*]], <16 x i16> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <16 x i16> [[_MSPROP]] to <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i16> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] 
= select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovsxw_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i64> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovsxw_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_512(<8 x i16> %x0, i8 %x2) #0 { -; -; 
CHECK-LABEL: @test_int_x86_avx512_maskz_pmovsxw_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> splat (i16 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[X0:%.*]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = sext <8 x i16> [[_MSPROP]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32>, <16 x i32>) - -define <16 x i32>@test_int_x86_avx512_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP7]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] 
to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - ret <16 x i32> %3 -} - -define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP15]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - ret <16 x i32> %3 -} - -declare <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64>, <8 x i64>) - -define <8 x i64>@test_int_x86_avx512_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512( -; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP7]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) - ret <8 x i64> %1 -} - -define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - ret <8 x i64> %3 -} - -define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; 
CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP15]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - ret <8 x i64> %3 -} - -declare <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32>, <16 x i32>) - -define <16 x i32>@test_int_x86_avx512_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP7]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> 
[[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - ret <16 x i32> %3 -} - -define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP15]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - ret <16 x i32> %3 -} - -declare <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64>, <8 x i64>) - -define <8 x i64>@test_int_x86_avx512_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP7]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) - ret <8 x i64> %1 -} - -define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - ret <8 x i64> %3 -} - -define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> 
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP15]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - ret <8 x i64> %3 -} - -declare <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32>, i32) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_prol_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; 
CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %1 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 3) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - %4 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 4) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer - %7 = call <16 x i32> @llvm.x86.avx512.prol.d.512(<16 x i32> %x0, i32 5) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64>, i32) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_prol_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> 
[[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %1 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 3) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - %4 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 4) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer - %7 = call <8 x i64> @llvm.x86.avx512.prol.q.512(<8 x i64> %x0, i32 5) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32>, i32) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: 
@test_int_x86_avx512_pror_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 
x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %1 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 3) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - %4 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 4) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer - %7 = call <16 x i32> @llvm.x86.avx512.pror.d.512(<16 x i32> %x0, i32 5) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %6, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %7, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64>, i32) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_pror_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; 
CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %1 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 3) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - %4 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 4) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer - %7 = call <8 x i64> @llvm.x86.avx512.pror.q.512(<8 x i64> %x0, i32 5) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %3, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %6, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %7, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0:%.*]], i32 4) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer 
-; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 5) -; CHECK-NEXT: [[TMP17:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 6) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[X0]], i32 6) -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP18]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP20]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]] -; CHECK-NEXT: [[TMP26:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP19]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[TMP15]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP16]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[_MSPROP_SELECT1]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 6, <8 x i64> zeroinitializer, i8 %x3) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32>@llvm.x86.avx512.mask.psrl.di.512(<16 x i32>, i32, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psrl_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0:%.*]], i32 4) -; CHECK-NEXT: 
[[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 5) -; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 6) -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[X0]], i32 6) -; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> [[TMP23]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP20]], <16 x i32> [[TMP25]], <16 x i32> [[TMP22]] -; CHECK-NEXT: [[TMP26:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP19]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[TMP15]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP16]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[_MSPROP_SELECT1]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 6, <16 x i32> zeroinitializer, i16 %x3) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32>, i32, <16 x i32>, i16) - -define { 
<16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psra_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psra_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[X0]], i32 5) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x 
i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.psra.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psra_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psra_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[X0]], i32 5) -; CHECK-NEXT: 
[[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.psra.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32>, i32, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_psll_di_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psll_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: 
[[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[X0]], i32 5) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.psll.di.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_psll_qi_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_psll_qi_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 3) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0:%.*]], i32 3) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> 
[[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 4) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 5) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[X0]], i32 5) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.psll.qi.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -define <16 x i32> @test_x86_avx512_psll_d(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psll_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP19]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psll_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psll_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) 
to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP18]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psll_q(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psll_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), 
i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_psll_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psll_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP18]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> 
zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrl_d(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrl_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP19]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> 
%a2, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psrl_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP18]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrl_q(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 
%mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psrl_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_maskz_psrl_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], 
[[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP18]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone - -define <16 x i32> @test_x86_avx512_psra_d(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP10]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_mask_psra_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP12]], [[A2:%.*]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> 
[[TMP16]], [[TMP11]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP18]], <16 x i32> [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP12]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP19]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_maskz_psra_d(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_x86_avx512_maskz_psra_d( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP10]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP18]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32>, <4 x i32>, <16 x i32>, i16) nounwind readnone - -define <8 x i64> @test_x86_avx512_psra_q(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_q( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; 
CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP10]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psra_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[TMP12]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP11]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP18]], <8 x i64> [[TMP15]]
-; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP14]], <8 x i64> [[TMP12]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psra_q(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psra_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP10]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP17]], <8 x i64> [[TMP14]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> [[TMP11]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP18]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64>, <2 x i64>, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psllv_d(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psllv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psllv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psllv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psllv_q(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psllv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psllv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psllv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
-
-
-define <16 x i32> @test_x86_avx512_psrav_d(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrav_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrav_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrav_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrav_q(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrav_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrav_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrav_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
-
-define <16 x i32> @test_x86_avx512_psrlv_d(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrlv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[A2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask)
- ret <16 x i32> %res
-}
-
-define <16 x i32> @test_x86_avx512_maskz_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrlv_q(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_psrlv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[A2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[A2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_x86_avx512_maskz_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> zeroinitializer, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readnone
-
-define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, ptr %ptr) #0 {
-;
-; CHECK-LABEL: @test_x86_avx512_psrlv_q_memop(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <8 x i64>, ptr [[PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i64> [[_MSLD]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = sext <8 x i1> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP2]], <8 x i64> [[B]])
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[B]])
-; CHECK-NEXT: store <8 x i64> [[TMP11]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %b = load <8 x i64>, ptr %ptr
- %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %b, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_cvt_dq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[CVT]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = sitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP13]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
- ret <8 x double> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_cvt_udq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[CVT]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[CVT:%.*]] = uitofp <8 x i32> [[X0:%.*]] to <8 x double>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[CVT]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X1:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[CVT]], <8 x double> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP13]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2)
- ret <8 x double> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) #0 {
-; CHECK-LABEL: @test_x86_vcvtph2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_sae(<16 x i16> %a0) #0 {
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_rrk(<16 x i16> %a0,<16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> [[A1:%.*]], i16 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> %a1, i16 %mask, i32 4)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_sae_rrkz(<16 x i16> %a0, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_sae_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_x86_vcvtph2ps_512_rrkz(<16 x i16> %a0, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_x86_vcvtph2ps_512_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i16> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> [[A0:%.*]], <16 x float> zeroinitializer, i16 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4)
- ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
-
-define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) #0 {
-; CHECK-LABEL: @test_valign_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> 
-; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> 
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[PALIGNR]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> zeroinitializer, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_valign_q(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <8 x i32> 
-; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <8 x i64> [[B:%.*]], <8 x i64> [[A:%.*]], <8 x i32> 
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[PALIGNR]], [[SRC:%.*]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[PALIGNR]], <8 x i64> [[SRC]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask)
- ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8)
-
-define <16 x i32> @test_maskz_valign_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_valign_d(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <16 x i32> 
-; CHECK-NEXT: [[PALIGNR:%.*]] = shufflevector <16 x i32> [[B:%.*]], <16 x i32> [[A:%.*]], <16 x i32> 
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[PALIGNR]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[PALIGNR]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32> %a, <16 x i32> %b, i32 5, <16 x i32> zeroinitializer, i16 %mask)
- ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16)
-
-declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP7]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP18]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-
-define <16 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermilvar_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
- ret <16 x float> %res
-}
-
-; Test case to make sure we can print shuffle decode comments for constant pool loads.
-define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> )
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[X2]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP17]], 0
-; CHECK-NEXT: br i1 [[_MSCMP3]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]]
-; CHECK: 18:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 19:
-; CHECK-NEXT: [[TMP20:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> )
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16 [[X3]] to <16 x i1>
-; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[TMP20]] to <16 x i32>
-; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[TMP27]], <16 x i32> [[TMP23]]
-; CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[TMP20]], <16 x float> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP29]], 0
-; CHECK-NEXT: br i1 [[_MSCMP4]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
-; CHECK: 30:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 31:
-; CHECK-NEXT: [[TMP32:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0]], <16 x i32> )
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], [[_MSPROP_SELECT1]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[TMP16]], [[TMP28]]
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i32> zeroinitializer, [[_MSPROP]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP32]], [[RES3]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES4]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3)
- %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> zeroinitializer, i16 %x3)
- %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 -1)
- %res3 = fadd <16 x float> %res, %res1
- %res4 = fadd <16 x float> %res2, %res3
- ret <16 x float> %res4
-}
-
-define <8 x i64> @test_mask_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mask_mul_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32)
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32)
-; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], 
zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rrk( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) -; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] -; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP28]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 
8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rrkz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32) -; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] -; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP27]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 
[[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP24]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat 
(i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; 
CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] -; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP32]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32) -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] 
= or <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP25]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: 
[[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] -; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP34]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbk_buildvector(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbk_buildvector( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 -; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 
-; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 -; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 -; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 -; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 -; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 -; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP8]] -; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] -; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP34]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 - %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 - %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 - %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 - %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, 
i64 %q, i32 4 - %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 - %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 - %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> 
zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epi32_rmbkz_buildvector(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epi32_rmbkz_buildvector( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 -; CHECK-NEXT: [[VECINIT_I1:%.*]] = insertelement <8 x i64> [[VECINIT_I]], i64 [[Q]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 -; CHECK-NEXT: [[VECINIT_I2:%.*]] = insertelement <8 x i64> [[VECINIT_I1]], i64 [[Q]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 -; CHECK-NEXT: [[VECINIT_I3:%.*]] = insertelement <8 x i64> [[VECINIT_I2]], i64 [[Q]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 -; CHECK-NEXT: [[VECINIT_I4:%.*]] = insertelement <8 x i64> [[VECINIT_I3]], i64 [[Q]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 -; CHECK-NEXT: [[VECINIT_I5:%.*]] = insertelement <8 x i64> [[VECINIT_I4]], i64 [[Q]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 -; CHECK-NEXT: [[VECINIT_I6:%.*]] = insertelement <8 x i64> [[VECINIT_I5]], i64 [[Q]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 -; CHECK-NEXT: [[B64:%.*]] = insertelement <8 x i64> [[VECINIT_I6]], i64 [[Q]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = 
bitcast <8 x i64> [[_MSPROP7]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32) -; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32) -; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32) -; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32) -; CHECK-NEXT: [[_MSPROP8:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP8]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP8]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement < 8 x i64> undef, i64 %q, i32 0 - %vecinit.i1 = insertelement < 8 x i64> %vecinit.i, i64 %q, i32 1 - %vecinit.i2 = insertelement < 8 x i64> %vecinit.i1, i64 %q, i32 2 - %vecinit.i3 = insertelement < 8 x i64> %vecinit.i2, i64 %q, i32 3 - %vecinit.i4 = insertelement < 8 x i64> %vecinit.i3, i64 %q, i32 4 - %vecinit.i5 = insertelement < 8 x i64> %vecinit.i4, i64 %q, i32 5 - %vecinit.i6 = insertelement < 8 x i64> %vecinit.i5, i64 %q, i32 6 - %b64 = insertelement < 8 x i64> %vecinit.i6, i64 %q, i32 7 - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) - -define <8 x i64> @test_mask_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 { -; CHECK-LABEL: @test_mask_mul_epu32_rr( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP19]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rrk( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP24:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> 
[[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[_MSPROP]] -; CHECK-NEXT: [[TMP27:%.*]] = or <8 x i64> [[TMP26]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP27]], <8 x i64> [[TMP24]] -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP23]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP28]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rrkz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]] -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]] -; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP27]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: 
@test_mask_mul_epu32_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP24]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 
[[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to 
<8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] -; CHECK-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP32]] -; - %b = load <16 x i32>, ptr %ptr_b - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmb( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP25]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 -1) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> 
zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP30:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP32]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP33]], <8 x i64> [[TMP30]] -; CHECK-NEXT: [[TMP34:%.*]] = select <8 x i1> [[TMP29]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP34]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mask_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_mul_epu32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; 
CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP33]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> zeroinitializer, i8 %mask) - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8) - -define <4 x float> @test_mask_vextractf32x4(<4 x float> %b, <16 x float> %a, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_vextractf32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[A:%.*]], <16 x float> [[A]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x float> [[TMP4]] to <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x float> [[B:%.*]] to <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i32> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP12]], <4 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[EXTRACT]], <4 x float> [[TMP4]], <4 x float> [[B]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP13]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float> %a, i32 2, <4 x float> %b, i8 %mask) - ret <4 x float> %res -} - -declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.512(<16 x float>, i32, <4 x float>, i8) - -define <4 x i64> @test_mask_vextracti64x4(<4 x i64> %b, <8 x i64> %a, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_vextracti64x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[A:%.*]], <8 x i64> [[A]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[_MSPROP]], <4 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i64> [[TMP4]], [[B:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i64> [[TMP10]], <4 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i64> [[TMP4]], <4 x i64> [[B]] -; CHECK-NEXT: store <4 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x i64> [[TMP11]] -; - %res = call <4 x i64> @llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64> %a, i32 1, <4 x i64> %b, i8 %mask) - ret <4 x i64> %res -} - -declare <4 x i64> 
@llvm.x86.avx512.mask.vextracti64x4.512(<8 x i64>, i32, <4 x i64>, i8) - -define <4 x i32> @test_maskz_vextracti32x4(<16 x i32> %a, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_vextracti32x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[A]], <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[_MSPROP]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP1]], <4 x i32> [[TMP9]], <4 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer -; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x i32> [[TMP10]] -; - %res = call <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32> %a, i32 2, <4 x i32> zeroinitializer, i8 %mask) - ret <4 x i32> %res -} - -declare <4 x i32> @llvm.x86.avx512.mask.vextracti32x4.512(<16 x i32>, i32, <4 x i32>, i8) - -define <4 x double> @test_vextractf64x4(<8 x double> %a) #0 { -; CHECK-LABEL: @test_vextractf64x4( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> [[A]], <4 x i32> -; CHECK-NEXT: store <4 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x double> [[TMP2]] -; - %res = call <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double> %a, i32 1, <4 x double> zeroinitializer, i8 -1) - ret <4 x double> %res -} - -declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, <4 x double>, i8) - -declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_insertf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> 
[[X0:%.*]], <16 x float> [[TMP3]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP4]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_insertf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[X3:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP14]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> [[X3]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP15]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X1:%.*]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[X0:%.*]], <16 x float> [[TMP4]], <16 x i32> -; CHECK-NEXT: 
[[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP12]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - ret <16 x float> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_inserti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP3]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_inserti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x 
i32> [[_MSPROP1]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X3:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X3]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP13]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> splat (i32 -1), <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X1:%.*]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> [[_MSPROP]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[X0:%.*]], <16 x i32> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - ret <16 x i32> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_insertf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> 
[[X0:%.*]], <8 x double> [[TMP3]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP4]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_insertf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[X3:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP14]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> [[X3]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP15]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_insertf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X1:%.*]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[X0:%.*]], <8 x double> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 
x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP12]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x double> [[TMP5]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - ret <8 x double> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_inserti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP3]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_inserti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 160) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X3:%.*]] -; 
CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X3]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP13]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_inserti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> splat (i64 -1), <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X1:%.*]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[_MSPROP]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[X0:%.*]], <8 x i64> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_movntdqa(ptr %a0) #0 { -; -; CHECK-LABEL: @test_x86_avx512_movntdqa( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr [[A0:%.*]], align 64, !nontemporal [[META2]] -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP7]], align 64 -; CHECK-NEXT: store <8 x i64> [[_MSLD]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %res = call <8 x i64> @llvm.x86.avx512.movntdqa(ptr %a0) - ret <8 x i64> %res -} - -declare <8 x i64> 
@llvm.x86.avx512.movntdqa(ptr) nounwind readonly - -define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_cmp_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1) -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0 -; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP12]], [[TMP1]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i32> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[TMP15]], [[TMP18]] -; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i1> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP24]], i32 1 -; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP25]], i32 1 -; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[TMP26]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[TMP26]], [[TMP1]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[TMP30]], [[TMP31]] -; CHECK-NEXT: [[TMP33:%.*]] = or <16 x i32> [[TMP30]], [[TMP2]] -; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <16 x i32> [[TMP28]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <16 x i32> [[TMP29]], [[TMP32]] -; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i1> [[TMP36]] to i16 -; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP38]], i32 2 -; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], 
i16 [[TMP39]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3 -; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3 -; CHECK-NEXT: [[TMP40:%.*]] = xor <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[TMP41]], zeroinitializer -; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[TMP41]], splat (i32 -1) -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i32> [[TMP43]], [[TMP40]] -; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <16 x i32> [[TMP44]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP42]], [[TMP45]] -; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP47:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16 -; CHECK-NEXT: [[TMP48:%.*]] = bitcast <16 x i1> [[TMP46]] to i16 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP47]], i32 4 -; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP48]], i32 4 -; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP50:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP51:%.*]] = and <16 x i32> [[TMP49]], [[TMP50]] -; CHECK-NEXT: [[TMP52:%.*]] = or <16 x i32> [[TMP49]], [[TMP1]] -; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP54:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i32> [[TMP53]], [[TMP54]] -; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <16 x i32> [[TMP51]], [[TMP56]] -; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <16 x i32> [[TMP52]], [[TMP55]] -; CHECK-NEXT: [[TMP59:%.*]] = xor <16 x i1> [[TMP57]], [[TMP58]] -; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP61:%.*]] = bitcast <16 x i1> [[TMP59]] to i16 -; CHECK-NEXT: [[TMP62:%.*]] = bitcast <16 x i1> [[TMP60]] to i16 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP61]], i32 5 -; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP62]], i32 5 -; CHECK-NEXT: [[TMP63:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP64:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i32> [[TMP63]], [[TMP64]] -; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i32> [[TMP63]], [[TMP1]] -; CHECK-NEXT: [[TMP67:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP68:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1) -; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i32> [[TMP67]], [[TMP68]] -; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i32> [[TMP67]], [[TMP2]] -; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <16 x i32> [[TMP65]], [[TMP70]] -; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <16 x i32> [[TMP66]], [[TMP69]] -; CHECK-NEXT: [[TMP73:%.*]] = xor <16 x i1> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP75:%.*]] = bitcast <16 x i1> [[TMP73]] to i16 -; CHECK-NEXT: [[TMP76:%.*]] = bitcast <16 x i1> [[TMP74]] to i16 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP75]], i32 6 -; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP76]], i32 6 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7 -; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7 -; CHECK-NEXT: store <8 x i16> 
[[_MSPROP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[VEC7]] -; - %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1) - %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0 - %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1) - %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1 - %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1) - %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2 - %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1) - %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3 - %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1) - %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4 - %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1) - %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5 - %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1) - %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6 - %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1) - %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7 - ret <8 x i16> %vec7 -} - -define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_cmp_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1) -; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]] -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0 -; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648) -; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1) -; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i32> 
[[TMP21]], [[TMP1]]
-; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP25]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP25]], [[TMP2]]
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[TMP24]], [[TMP27]]
-; CHECK-NEXT: [[TMP31:%.*]] = xor <16 x i1> [[TMP29]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = icmp slt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP32]], [[TMP33]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP31]], [[TMP34]]
-; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP35]], [[TMP36]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP38]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i1> [[TMP32]], [[TMP34]]
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP41]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP42]], i32 1
-; CHECK-NEXT: [[TMP43:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[TMP43]], [[TMP1]]
-; CHECK-NEXT: [[TMP47:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[TMP47]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <16 x i32> [[TMP45]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <16 x i32> [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP57:%.*]] = and <16 x i1> [[TMP53]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <16 x i1> [[TMP53]], [[TMP56]]
-; CHECK-NEXT: [[TMP60:%.*]] = or <16 x i1> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP61:%.*]] = or <16 x i1> [[TMP60]], [[TMP59]]
-; CHECK-NEXT: [[TMP62:%.*]] = and <16 x i1> [[TMP54]], [[TMP56]]
-; CHECK-NEXT: [[TMP63:%.*]] = bitcast <16 x i1> [[TMP61]] to i16
-; CHECK-NEXT: [[TMP64:%.*]] = bitcast <16 x i1> [[TMP62]] to i16
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP63]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP64]], i32 2
-; CHECK-NEXT: [[TMP65:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP66:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP67:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP69:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP70:%.*]] = or <16 x i1> [[TMP67]], [[TMP68]]
-; CHECK-NEXT: [[TMP71:%.*]] = or <16 x i1> [[TMP70]], [[TMP69]]
-; CHECK-NEXT: [[TMP72:%.*]] = and <16 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast <16 x i1> [[TMP71]] to i16
-; CHECK-NEXT: [[TMP74:%.*]] = bitcast <16 x i1> [[TMP72]] to i16
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP73]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP74]], i32 3
-; CHECK-NEXT: [[TMP75:%.*]] = xor <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP76:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[TMP76]], zeroinitializer
-; CHECK-NEXT: [[TMP78:%.*]] = xor <16 x i32> [[TMP76]], splat (i32 -1)
-; CHECK-NEXT: [[TMP79:%.*]] = and <16 x i32> [[TMP78]], [[TMP75]]
-; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <16 x i32> [[TMP79]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP77]], [[TMP80]]
-; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP82:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP83:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP84:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP81]], [[TMP82]]
-; CHECK-NEXT: [[TMP86:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP83]]
-; CHECK-NEXT: [[TMP87:%.*]] = or <16 x i1> [[TMP84]], [[TMP85]]
-; CHECK-NEXT: [[TMP88:%.*]] = or <16 x i1> [[TMP87]], [[TMP86]]
-; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i1> [[TMP81]], [[TMP83]]
-; CHECK-NEXT: [[TMP90:%.*]] = bitcast <16 x i1> [[TMP88]] to i16
-; CHECK-NEXT: [[TMP91:%.*]] = bitcast <16 x i1> [[TMP89]] to i16
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP90]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP91]], i32 4
-; CHECK-NEXT: [[TMP92:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP93:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP94:%.*]] = and <16 x i32> [[TMP92]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = or <16 x i32> [[TMP92]], [[TMP1]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP97:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP98:%.*]] = and <16 x i32> [[TMP96]], [[TMP97]]
-; CHECK-NEXT: [[TMP99:%.*]] = or <16 x i32> [[TMP96]], [[TMP2]]
-; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <16 x i32> [[TMP94]], [[TMP99]]
-; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <16 x i32> [[TMP95]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = xor <16 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP104:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP105:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP106:%.*]] = and <16 x i1> [[TMP102]], [[TMP104]]
-; CHECK-NEXT: [[TMP107:%.*]] = and <16 x i1> [[TMP103]], [[TMP104]]
-; CHECK-NEXT: [[TMP108:%.*]] = and <16 x i1> [[TMP102]], [[TMP105]]
-; CHECK-NEXT: [[TMP109:%.*]] = or <16 x i1> [[TMP106]], [[TMP107]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i1> [[TMP109]], [[TMP108]]
-; CHECK-NEXT: [[TMP111:%.*]] = and <16 x i1> [[TMP103]], [[TMP105]]
-; CHECK-NEXT: [[TMP112:%.*]] = bitcast <16 x i1> [[TMP110]] to i16
-; CHECK-NEXT: [[TMP113:%.*]] = bitcast <16 x i1> [[TMP111]] to i16
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP112]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP113]], i32 5
-; CHECK-NEXT: [[TMP114:%.*]] = xor <16 x i32> [[A0]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP115:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP116:%.*]] = and <16 x i32> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = or <16 x i32> [[TMP114]], [[TMP1]]
-; CHECK-NEXT: [[TMP118:%.*]] = xor <16 x i32> [[A1]], splat (i32 -2147483648)
-; CHECK-NEXT: [[TMP119:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i32> [[TMP118]], [[TMP119]]
-; CHECK-NEXT: [[TMP121:%.*]] = or <16 x i32> [[TMP118]], [[TMP2]]
-; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <16 x i32> [[TMP116]], [[TMP121]]
-; CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <16 x i32> [[TMP117]], [[TMP120]]
-; CHECK-NEXT: [[TMP124:%.*]] = xor <16 x i1> [[TMP122]], [[TMP123]]
-; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP128:%.*]] = and <16 x i1> [[TMP124]], [[TMP126]]
-; CHECK-NEXT: [[TMP129:%.*]] = and <16 x i1> [[TMP125]], [[TMP126]]
-; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> [[TMP124]], [[TMP127]]
-; CHECK-NEXT: [[TMP131:%.*]] = or <16 x i1> [[TMP128]], [[TMP129]]
-; CHECK-NEXT: [[TMP132:%.*]] = or <16 x i1> [[TMP131]], [[TMP130]]
-; CHECK-NEXT: [[TMP133:%.*]] = and <16 x i1> [[TMP125]], [[TMP127]]
-; CHECK-NEXT: [[TMP134:%.*]] = bitcast <16 x i1> [[TMP132]] to i16
-; CHECK-NEXT: [[TMP135:%.*]] = bitcast <16 x i1> [[TMP133]] to i16
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP134]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP135]], i32 6
-; CHECK-NEXT: [[TMP136:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP137:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP138:%.*]] = and <16 x i1> zeroinitializer, [[TMP136]]
-; CHECK-NEXT: [[TMP139:%.*]] = and <16 x i1> splat (i1 true), [[TMP136]]
-; CHECK-NEXT: [[TMP140:%.*]] = and <16 x i1> zeroinitializer, [[TMP137]]
-; CHECK-NEXT: [[TMP141:%.*]] = or <16 x i1> [[TMP138]], [[TMP139]]
-; CHECK-NEXT: [[TMP142:%.*]] = or <16 x i1> [[TMP141]], [[TMP140]]
-; CHECK-NEXT: [[TMP143:%.*]] = and <16 x i1> splat (i1 true), [[TMP137]]
-; CHECK-NEXT: [[TMP144:%.*]] = bitcast <16 x i1> [[TMP142]] to i16
-; CHECK-NEXT: [[TMP145:%.*]] = bitcast <16 x i1> [[TMP143]] to i16
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP144]], i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP145]], i32 7
-; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i16> [[VEC7]]
-;
- %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
- %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
- %res1 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
- %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
- %res2 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
- %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
- %res3 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
- %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
- %res4 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
- %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
- %res5 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
- %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
- %res6 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
- %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
- %res7 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
- %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
- ret <8 x i16> %vec7
-}
-
-declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
-
-define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 {
-; CHECK-LABEL: @test_ucmp_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i32> [[TMP4]], splat (i32 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i32> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP10]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[A0]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i32> [[A1]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <16 x i32> [[TMP13]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <16 x i32> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i1> [[TMP20]] to i16
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP22]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP23]], i32 1
-; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A0]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP27:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[A1]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <16 x i32> [[TMP25]], [[TMP29]]
-; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <16 x i32> [[TMP26]], [[TMP28]]
-; CHECK-NEXT: [[TMP32:%.*]] = xor <16 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast <16 x i1> [[TMP32]] to i16
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast <16 x i1> [[TMP33]] to i16
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP34]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP35]], i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 0, i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 0, i32 3
-; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <16 x i32> [[TMP37]], zeroinitializer
-; CHECK-NEXT: [[TMP39:%.*]] = xor <16 x i32> [[TMP37]], splat (i32 -1)
-; CHECK-NEXT: [[TMP40:%.*]] = and <16 x i32> [[TMP39]], [[TMP36]]
-; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <16 x i32> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP38]], [[TMP41]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP4]] to i16
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast <16 x i1> [[TMP42]] to i16
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP43]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP44]], i32 4
-; CHECK-NEXT: [[TMP45:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP46:%.*]] = and <16 x i32> [[A0]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP48:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <16 x i32> [[A1]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <16 x i32> [[TMP46]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <16 x i32> [[TMP47]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <16 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast <16 x i1> [[TMP53]] to i16
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast <16 x i1> [[TMP54]] to i16
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP55]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP56]], i32 5
-; CHECK-NEXT: [[TMP57:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i32> [[A0]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP60:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP61:%.*]] = and <16 x i32> [[A1]], [[TMP60]]
-; CHECK-NEXT: [[TMP62:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <16 x i32> [[TMP58]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <16 x i32> [[TMP59]], [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = xor <16 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP67:%.*]] = bitcast <16 x i1> [[TMP65]] to i16
-; CHECK-NEXT: [[TMP68:%.*]] = bitcast <16 x i1> [[TMP66]] to i16
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP67]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP68]], i32 6
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 0, i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 -1, i32 7
-; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i16> [[VEC7]]
-;
- %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
- %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
- %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 -1)
- %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
- %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 -1)
- %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
- %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 -1)
- %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
- %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 -1)
- %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
- %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 -1)
- %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
- %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 -1)
- %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
- %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 -1)
- %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
- ret <8 x i16> %vec7
-}
-
-define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_ucmp_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i32> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[TMP5]], splat (i32 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i32> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x i1> [[TMP17]] to i16
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i1> [[TMP18]] to i16
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i16> splat (i16 -1), i16 [[TMP19]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i32> [[A0]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <16 x i32> [[A1]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <16 x i32> [[TMP22]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <16 x i32> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <16 x i1> [[TMP27]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = and <16 x i1> [[TMP29]], [[TMP31]]
-; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP29]], [[TMP32]]
-; CHECK-NEXT: [[TMP36:%.*]] = or <16 x i1> [[TMP33]], [[TMP34]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i1> [[TMP37]] to i16
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i16> [[_MSPROP]], i16 [[TMP39]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i16> [[VEC0]], i16 [[TMP40]], i32 1
-; CHECK-NEXT: [[TMP41:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP42:%.*]] = and <16 x i32> [[A0]], [[TMP41]]
-; CHECK-NEXT: [[TMP43:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP44:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <16 x i32> [[A1]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <16 x i32> [[TMP42]], [[TMP46]]
-; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <16 x i32> [[TMP43]], [[TMP45]]
-; CHECK-NEXT: [[TMP49:%.*]] = xor <16 x i1> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP52:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP53:%.*]] = and <16 x i1> [[TMP49]], [[TMP51]]
-; CHECK-NEXT: [[TMP54:%.*]] = and <16 x i1> [[TMP50]], [[TMP51]]
-; CHECK-NEXT: [[TMP55:%.*]] = and <16 x i1> [[TMP49]], [[TMP52]]
-; CHECK-NEXT: [[TMP56:%.*]] = or <16 x i1> [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP57:%.*]] = or <16 x i1> [[TMP56]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <16 x i1> [[TMP50]], [[TMP52]]
-; CHECK-NEXT: [[TMP59:%.*]] = bitcast <16 x i1> [[TMP57]] to i16
-; CHECK-NEXT: [[TMP60:%.*]] = bitcast <16 x i1> [[TMP58]] to i16
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i16> [[_MSPROP1]], i16 [[TMP59]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[TMP60]], i32 2
-; CHECK-NEXT: [[TMP61:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP62:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP63:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP64:%.*]] = and <16 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP66:%.*]] = or <16 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP67:%.*]] = or <16 x i1> [[TMP66]], [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <16 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast <16 x i1> [[TMP67]] to i16
-; CHECK-NEXT: [[TMP70:%.*]] = bitcast <16 x i1> [[TMP68]] to i16
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i16> [[_MSPROP2]], i16 [[TMP69]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[TMP70]], i32 3
-; CHECK-NEXT: [[TMP71:%.*]] = xor <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP72:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP73:%.*]] = icmp ne <16 x i32> [[TMP72]], zeroinitializer
-; CHECK-NEXT: [[TMP74:%.*]] = xor <16 x i32> [[TMP72]], splat (i32 -1)
-; CHECK-NEXT: [[TMP75:%.*]] = and <16 x i32> [[TMP74]], [[TMP71]]
-; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <16 x i32> [[TMP75]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <16 x i1> [[TMP73]], [[TMP76]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP78:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP79:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP80:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP78]]
-; CHECK-NEXT: [[TMP81:%.*]] = and <16 x i1> [[TMP77]], [[TMP78]]
-; CHECK-NEXT: [[TMP82:%.*]] = and <16 x i1> [[_MSPROP_ICMP4]], [[TMP79]]
-; CHECK-NEXT: [[TMP83:%.*]] = or <16 x i1> [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP84:%.*]] = or <16 x i1> [[TMP83]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <16 x i1> [[TMP77]], [[TMP79]]
-; CHECK-NEXT: [[TMP86:%.*]] = bitcast <16 x i1> [[TMP84]] to i16
-; CHECK-NEXT: [[TMP87:%.*]] = bitcast <16 x i1> [[TMP85]] to i16
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i16> [[_MSPROP3]], i16 [[TMP86]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[TMP87]], i32 4
-; CHECK-NEXT: [[TMP88:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP89:%.*]] = and <16 x i32> [[A0]], [[TMP88]]
-; CHECK-NEXT: [[TMP90:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP91:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP92:%.*]] = and <16 x i32> [[A1]], [[TMP91]]
-; CHECK-NEXT: [[TMP93:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <16 x i32> [[TMP89]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <16 x i32> [[TMP90]], [[TMP92]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <16 x i1> [[TMP94]], [[TMP95]]
-; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP98:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP99:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP100:%.*]] = and <16 x i1> [[TMP96]], [[TMP98]]
-; CHECK-NEXT: [[TMP101:%.*]] = and <16 x i1> [[TMP97]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = and <16 x i1> [[TMP96]], [[TMP99]]
-; CHECK-NEXT: [[TMP103:%.*]] = or <16 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP104:%.*]] = or <16 x i1> [[TMP103]], [[TMP102]]
-; CHECK-NEXT: [[TMP105:%.*]] = and <16 x i1> [[TMP97]], [[TMP99]]
-; CHECK-NEXT: [[TMP106:%.*]] = bitcast <16 x i1> [[TMP104]] to i16
-; CHECK-NEXT: [[TMP107:%.*]] = bitcast <16 x i1> [[TMP105]] to i16
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i16> [[_MSPROP5]], i16 [[TMP106]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[TMP107]], i32 5
-; CHECK-NEXT: [[TMP108:%.*]] = xor <16 x i32> [[TMP1]], splat (i32 -1)
-; CHECK-NEXT: [[TMP109:%.*]] = and <16 x i32> [[A0]], [[TMP108]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <16 x i32> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP111:%.*]] = xor <16 x i32> [[TMP2]], splat (i32 -1)
-; CHECK-NEXT: [[TMP112:%.*]] = and <16 x i32> [[A1]], [[TMP111]]
-; CHECK-NEXT: [[TMP113:%.*]] = or <16 x i32> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <16 x i32> [[TMP109]], [[TMP113]]
-; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <16 x i32> [[TMP110]], [[TMP112]]
-; CHECK-NEXT: [[TMP116:%.*]] = xor <16 x i1> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <16 x i32> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP118:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP119:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP120:%.*]] = and <16 x i1> [[TMP116]], [[TMP118]]
-; CHECK-NEXT: [[TMP121:%.*]] = and <16 x i1> [[TMP117]], [[TMP118]]
-; CHECK-NEXT: [[TMP122:%.*]] = and <16 x i1> [[TMP116]], [[TMP119]]
-; CHECK-NEXT: [[TMP123:%.*]] = or <16 x i1> [[TMP120]], [[TMP121]]
-; CHECK-NEXT: [[TMP124:%.*]] = or <16 x i1> [[TMP123]], [[TMP122]]
-; CHECK-NEXT: [[TMP125:%.*]] = and <16 x i1> [[TMP117]], [[TMP119]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast <16 x i1> [[TMP124]] to i16
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast <16 x i1> [[TMP125]] to i16
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i16> [[_MSPROP6]], i16 [[TMP126]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[TMP127]], i32 6
-; CHECK-NEXT: [[TMP128:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP129:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
-; CHECK-NEXT: [[TMP130:%.*]] = and <16 x i1> zeroinitializer, [[TMP128]]
-; CHECK-NEXT: [[TMP131:%.*]] = and <16 x i1> splat (i1 true), [[TMP128]]
-; CHECK-NEXT: [[TMP132:%.*]] = and <16 x i1> zeroinitializer, [[TMP129]]
-; CHECK-NEXT: [[TMP133:%.*]] = or <16 x i1> [[TMP130]], [[TMP131]]
-; CHECK-NEXT: [[TMP134:%.*]] = or <16 x i1> [[TMP133]], [[TMP132]]
-; CHECK-NEXT: [[TMP135:%.*]] = and <16 x i1> splat (i1 true), [[TMP129]]
-; CHECK-NEXT: [[TMP136:%.*]] = bitcast <16 x i1> [[TMP134]] to i16
-; CHECK-NEXT: [[TMP137:%.*]] = bitcast <16 x i1> [[TMP135]] to i16
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i16> [[_MSPROP7]], i16 [[TMP136]], i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[TMP137]], i32 7
-; CHECK-NEXT: store <8 x i16> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i16> [[VEC7]]
-;
- %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
- %vec0 = insertelement <8 x i16> undef, i16 %res0, i32 0
- %res1 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 1, i16 %mask)
- %vec1 = insertelement <8 x i16> %vec0, i16 %res1, i32 1
- %res2 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 2, i16 %mask)
- %vec2 = insertelement <8 x i16> %vec1, i16 %res2, i32 2
- %res3 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 3, i16 %mask)
- %vec3 = insertelement <8 x i16> %vec2, i16 %res3, i32 3
- %res4 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 4, i16 %mask)
- %vec4 = insertelement <8 x i16> %vec3, i16 %res4, i32 4
- %res5 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 5, i16 %mask)
- %vec5 = insertelement <8 x i16> %vec4, i16 %res5, i32 5
- %res6 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 6, i16 %mask)
- %vec6 = insertelement <8 x i16> %vec5, i16 %res6, i32 6
- %res7 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 7, i16 %mask)
- %vec7 = insertelement <8 x i16> %vec6, i16 %res7, i32 7
- ret <8 x i16> %vec7
-}
-
-declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) nounwind readnone
-
-define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_cmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP1]]
-; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP16]], [[TMP2]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[TMP15]], [[TMP18]]
-; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i1> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast <8 x i1> [[TMP22]] to i8
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP24]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP25]], i32 1
-; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[TMP26]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP26]], [[TMP1]]
-; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = or <8 x i64> [[TMP30]], [[TMP2]]
-; CHECK-NEXT: [[TMP34:%.*]] = icmp ule <8 x i64> [[TMP28]], [[TMP33]]
-; CHECK-NEXT: [[TMP35:%.*]] = icmp ule <8 x i64> [[TMP29]], [[TMP32]]
-; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i1> [[TMP34]], [[TMP35]]
-; CHECK-NEXT: [[TMP37:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i1> [[TMP36]] to i8
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP38]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP39]], i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3
-; CHECK-NEXT: [[TMP40:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP41:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[TMP41]], zeroinitializer
-; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x i64> [[TMP41]], splat (i64 -1)
-; CHECK-NEXT: [[TMP44:%.*]] = and <8 x i64> [[TMP43]], [[TMP40]]
-; CHECK-NEXT: [[TMP45:%.*]] = icmp eq <8 x i64> [[TMP44]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP42]], [[TMP45]]
-; CHECK-NEXT: [[TMP46:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP47:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8
-; CHECK-NEXT: [[TMP48:%.*]] = bitcast <8 x i1> [[TMP46]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP47]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP48]], i32 4
-; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP50:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP51:%.*]] = and <8 x i64> [[TMP49]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = or <8 x i64> [[TMP49]], [[TMP1]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP54:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i64> [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i64> [[TMP53]], [[TMP2]]
-; CHECK-NEXT: [[TMP57:%.*]] = icmp uge <8 x i64> [[TMP51]], [[TMP56]]
-; CHECK-NEXT: [[TMP58:%.*]] = icmp uge <8 x i64> [[TMP52]], [[TMP55]]
-; CHECK-NEXT: [[TMP59:%.*]] = xor <8 x i1> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP60:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP61:%.*]] = bitcast <8 x i1> [[TMP59]] to i8
-; CHECK-NEXT: [[TMP62:%.*]] = bitcast <8 x i1> [[TMP60]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP61]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP62]], i32 5
-; CHECK-NEXT: [[TMP63:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP64:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i64> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i64> [[TMP63]], [[TMP1]]
-; CHECK-NEXT: [[TMP67:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP68:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i64> [[TMP67]], [[TMP68]]
-; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i64> [[TMP67]], [[TMP2]]
-; CHECK-NEXT: [[TMP71:%.*]] = icmp ugt <8 x i64> [[TMP65]], [[TMP70]]
-; CHECK-NEXT: [[TMP72:%.*]] = icmp ugt <8 x i64> [[TMP66]], [[TMP69]]
-; CHECK-NEXT: [[TMP73:%.*]] = xor <8 x i1> [[TMP71]], [[TMP72]]
-; CHECK-NEXT: [[TMP74:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP75:%.*]] = bitcast <8 x i1> [[TMP73]] to i8
-; CHECK-NEXT: [[TMP76:%.*]] = bitcast <8 x i1> [[TMP74]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP75]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP76]], i32 6
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7
-; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i8> [[VEC7]]
-;
- %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
- %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
- %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
- %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
- %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
- %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
- %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
- %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
- %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
- %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
- %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
- %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
- %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
- %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
- %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
- %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
- ret <8 x i8> %vec7
-}
-
-define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_cmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP22:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP21]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP1]]
-; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP26:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP25]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = or <8 x i64> [[TMP25]], [[TMP2]]
-; CHECK-NEXT: [[TMP29:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[TMP24]], [[TMP27]]
-; CHECK-NEXT: [[TMP31:%.*]] = xor <8 x i1> [[TMP29]], [[TMP30]]
-; CHECK-NEXT: [[TMP32:%.*]] = icmp slt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i1> [[TMP31]], [[TMP34]]
-; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP35]], [[TMP36]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <8 x i1> [[TMP38]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i1> [[TMP32]], [[TMP34]]
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <8 x i1> [[TMP40]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP41]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP42]], i32 1
-; CHECK-NEXT: [[TMP43:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[TMP43]], [[TMP1]]
-; CHECK-NEXT: [[TMP47:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[TMP47]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp ule <8 x i64> [[TMP45]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp ule <8 x i64> [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp sle <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP57:%.*]] = and <8 x i1> [[TMP53]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP54]], [[TMP55]]
-; CHECK-NEXT: [[TMP59:%.*]] = and <8 x i1> [[TMP53]], [[TMP56]]
-; CHECK-NEXT: [[TMP60:%.*]] = or <8 x i1> [[TMP57]], [[TMP58]]
-; CHECK-NEXT: [[TMP61:%.*]] = or <8 x i1> [[TMP60]], [[TMP59]]
-; CHECK-NEXT: [[TMP62:%.*]] = and <8 x i1> [[TMP54]], [[TMP56]]
-; CHECK-NEXT: [[TMP63:%.*]] = bitcast <8 x i1> [[TMP61]] to i8
-; CHECK-NEXT: [[TMP64:%.*]] = bitcast <8 x i1> [[TMP62]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP63]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP64]], i32 2
-; CHECK-NEXT: [[TMP65:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP66:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP67:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP65]]
-; CHECK-NEXT: [[TMP69:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP70:%.*]] = or <8 x i1> [[TMP67]], [[TMP68]]
-; CHECK-NEXT: [[TMP71:%.*]] = or <8 x i1> [[TMP70]], [[TMP69]]
-; CHECK-NEXT: [[TMP72:%.*]] = and <8 x i1> zeroinitializer, [[TMP66]]
-; CHECK-NEXT: [[TMP73:%.*]] = bitcast <8 x i1> [[TMP71]] to i8
-; CHECK-NEXT: [[TMP74:%.*]] = bitcast <8 x i1> [[TMP72]] to i8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP73]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP74]], i32 3
-; CHECK-NEXT: [[TMP75:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP76:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[TMP76]], zeroinitializer
-; CHECK-NEXT: [[TMP78:%.*]] = xor <8 x i64> [[TMP76]], splat (i64 -1)
-; CHECK-NEXT: [[TMP79:%.*]] = and <8 x i64> [[TMP78]], [[TMP75]]
-; CHECK-NEXT: [[TMP80:%.*]] = icmp eq <8 x i64> [[TMP79]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP77]], [[TMP80]]
-; CHECK-NEXT: [[TMP81:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP82:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP83:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP84:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP81]], [[TMP82]]
-; CHECK-NEXT: [[TMP86:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP83]]
-; CHECK-NEXT: [[TMP87:%.*]] = or <8 x i1> [[TMP84]], [[TMP85]]
-; CHECK-NEXT: [[TMP88:%.*]] = or <8 x i1> [[TMP87]], [[TMP86]]
-; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i1> [[TMP81]], [[TMP83]]
-; CHECK-NEXT: [[TMP90:%.*]] = bitcast <8 x i1> [[TMP88]] to i8
-; CHECK-NEXT: [[TMP91:%.*]] = bitcast <8 x i1> [[TMP89]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP90]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP91]], i32 4
-; CHECK-NEXT: [[TMP92:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP93:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP94:%.*]] = and <8 x i64> [[TMP92]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = or <8 x i64> [[TMP92]], [[TMP1]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP97:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP98:%.*]] = and <8 x i64> [[TMP96]], [[TMP97]]
-; CHECK-NEXT: [[TMP99:%.*]] = or <8 x i64> [[TMP96]], [[TMP2]]
-; CHECK-NEXT: [[TMP100:%.*]] = icmp uge <8 x i64> [[TMP94]], [[TMP99]]
-; CHECK-NEXT: [[TMP101:%.*]] = icmp uge <8 x i64> [[TMP95]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = xor <8 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP103:%.*]] = icmp sge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP104:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP105:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP106:%.*]] = and <8 x i1> [[TMP102]], [[TMP104]]
-; CHECK-NEXT: [[TMP107:%.*]] = and <8 x i1> [[TMP103]], [[TMP104]]
-; CHECK-NEXT: [[TMP108:%.*]] = and <8 x i1> [[TMP102]], [[TMP105]]
-; CHECK-NEXT: [[TMP109:%.*]] = or <8 x i1> [[TMP106]], [[TMP107]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i1> [[TMP109]], [[TMP108]]
-; CHECK-NEXT: [[TMP111:%.*]] = and <8 x i1> [[TMP103]], [[TMP105]]
-; CHECK-NEXT: [[TMP112:%.*]] = bitcast <8 x i1> [[TMP110]] to i8
-; CHECK-NEXT: [[TMP113:%.*]] = bitcast <8 x i1> [[TMP111]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP112]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP113]], i32 5
-; CHECK-NEXT: [[TMP114:%.*]] = xor <8 x i64> [[A0]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP115:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP116:%.*]] = and <8 x i64> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = or <8 x i64> [[TMP114]], [[TMP1]]
-; CHECK-NEXT: [[TMP118:%.*]] = xor <8 x i64> [[A1]], splat (i64 -9223372036854775808)
-; CHECK-NEXT: [[TMP119:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i64> [[TMP118]], [[TMP119]]
-; CHECK-NEXT: [[TMP121:%.*]] = or <8 x i64> [[TMP118]], [[TMP2]]
-; CHECK-NEXT: [[TMP122:%.*]] = icmp ugt <8 x i64> [[TMP116]], [[TMP121]]
-; CHECK-NEXT: [[TMP123:%.*]] = icmp ugt <8 x i64> [[TMP117]], [[TMP120]]
-; CHECK-NEXT: [[TMP124:%.*]] = xor <8 x i1> [[TMP122]], [[TMP123]]
-; CHECK-NEXT: [[TMP125:%.*]] = icmp sgt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP128:%.*]] = and <8 x i1> [[TMP124]], [[TMP126]]
-; CHECK-NEXT: [[TMP129:%.*]] = and <8 x i1> [[TMP125]], [[TMP126]]
-; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> [[TMP124]], [[TMP127]]
-; CHECK-NEXT: [[TMP131:%.*]] = or <8 x i1> [[TMP128]], [[TMP129]]
-; CHECK-NEXT: [[TMP132:%.*]] = or <8 x i1> [[TMP131]], [[TMP130]]
-; CHECK-NEXT: [[TMP133:%.*]] = and <8 x i1> [[TMP125]], [[TMP127]]
-; CHECK-NEXT: [[TMP134:%.*]] = bitcast <8 x i1> [[TMP132]] to i8
-; CHECK-NEXT: [[TMP135:%.*]] = bitcast <8 x i1> [[TMP133]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP134]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP135]], i32 6
-; CHECK-NEXT: [[TMP136:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP137:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP138:%.*]] = and <8 x i1> zeroinitializer, [[TMP136]]
-; CHECK-NEXT: [[TMP139:%.*]] = and <8 x i1> splat (i1 true), [[TMP136]]
-; CHECK-NEXT: [[TMP140:%.*]] = and <8 x i1> zeroinitializer, [[TMP137]]
-; CHECK-NEXT: [[TMP141:%.*]] = or <8 x i1> [[TMP138]], [[TMP139]]
-; CHECK-NEXT: [[TMP142:%.*]] = or <8 x i1> [[TMP141]], [[TMP140]]
-; CHECK-NEXT: [[TMP143:%.*]] = and <8 x i1> splat (i1 true), [[TMP137]]
-; CHECK-NEXT: [[TMP144:%.*]] = bitcast <8 x i1> [[TMP142]] to i8
-; CHECK-NEXT: [[TMP145:%.*]] = bitcast <8 x i1> [[TMP143]] to i8
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP144]], i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP145]], i32 7
-; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i8> [[VEC7]]
-;
- %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
- %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
- %res1 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask)
- %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
- %res2 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask)
- %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
- %res3 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask)
- %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
- %res4 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask)
- %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
- %res5 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask)
- %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
- %res6 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask)
- %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
- %res7 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask)
- %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
- ret <8 x i8> %vec7
-}
-
-declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone
-
-define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 {
-; CHECK-LABEL: @test_ucmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <8 x i64> [[TMP4]], splat (i64 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP5]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP9]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP10]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[A0]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[A1]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <8 x i64> [[TMP13]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <8 x i64> [[TMP14]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i1> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i1> [[TMP20]] to i8
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i1> [[TMP21]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP22]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP23]], i32 1
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A0]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP27:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[A1]], [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ule <8 x i64> [[TMP25]], [[TMP29]]
-; CHECK-NEXT: [[TMP31:%.*]] = icmp ule <8 x i64> [[TMP26]], [[TMP28]]
-; CHECK-NEXT: [[TMP32:%.*]] = xor <8 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast <8 x i1> [[TMP32]] to i8
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast <8 x i1> [[TMP33]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP34]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP35]], i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 0, i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 0, i32 3
-; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP38:%.*]] = icmp ne <8 x i64> [[TMP37]], zeroinitializer
-; CHECK-NEXT: [[TMP39:%.*]] = xor <8 x i64> [[TMP37]], splat (i64 -1)
-; CHECK-NEXT: [[TMP40:%.*]] = and <8 x i64> [[TMP39]], [[TMP36]]
-; CHECK-NEXT: [[TMP41:%.*]] = icmp eq <8 x i64> [[TMP40]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP38]], [[TMP41]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP4]] to i8
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast <8 x i1> [[TMP42]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP43]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP44]], i32 4
-; CHECK-NEXT: [[TMP45:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP46:%.*]] = and <8 x i64> [[A0]], [[TMP45]]
-; CHECK-NEXT: [[TMP47:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP48:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP49:%.*]] = and <8 x i64> [[A1]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = icmp uge <8 x i64> [[TMP46]], [[TMP50]]
-; CHECK-NEXT: [[TMP52:%.*]] = icmp uge <8 x i64> [[TMP47]], [[TMP49]]
-; CHECK-NEXT: [[TMP53:%.*]] = xor <8 x i1> [[TMP51]], [[TMP52]]
-; CHECK-NEXT: [[TMP54:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP55:%.*]] = bitcast <8 x i1> [[TMP53]] to i8
-; CHECK-NEXT: [[TMP56:%.*]] = bitcast <8 x i1> [[TMP54]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP55]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP56]], i32 5
-; CHECK-NEXT: [[TMP57:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i64> [[A0]], [[TMP57]]
-; CHECK-NEXT: [[TMP59:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP60:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP61:%.*]] = and <8 x i64> [[A1]], [[TMP60]]
-; CHECK-NEXT: [[TMP62:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP63:%.*]] = icmp ugt <8 x i64> [[TMP58]], [[TMP62]]
-; CHECK-NEXT: [[TMP64:%.*]] = icmp ugt <8 x i64> [[TMP59]], [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = xor <8 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP66:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP67:%.*]] = bitcast <8 x i1> [[TMP65]] to i8
-; CHECK-NEXT: [[TMP68:%.*]] = bitcast <8 x i1> [[TMP66]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP67]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP68]], i32 6
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 0, i32 7
-; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 -1, i32 7
-; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i8> [[VEC7]]
-;
- %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
- %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0
- %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 -1)
- %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1
- %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 -1)
- %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2
- %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 -1)
- %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3
- %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 -1)
- %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4
- %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 -1)
- %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5
- %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 -1)
- %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6
- %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 -1)
- %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7
- ret <8 x i8> %vec7
-}
-
-define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_ucmp_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i64> [[A0:%.*]], [[A1:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP5]], splat (i64 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP7]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP6]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i1> [[TMP10]], [[TMP12]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i1> [[TMP17]] to i8
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP18]] to i8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i8> splat (i8 -1), i8 [[TMP19]], i32 0
-; CHECK-NEXT: [[VEC0:%.*]] = insertelement <8 x i8> undef, i8 [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[A0]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[A1]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <8 x i64> [[TMP22]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = icmp ult <8 x i64> [[TMP23]], [[TMP25]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i1> [[TMP27]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp ult <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = and <8 x i1> [[TMP29]], [[TMP31]]
-; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP30]], [[TMP31]]
-; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP29]], [[TMP32]]
-; CHECK-NEXT: [[TMP36:%.*]] = or <8 x i1> [[TMP33]], [[TMP34]]
-; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP36]], [[TMP35]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <8 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i1> [[TMP37]] to i8
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i8> [[_MSPROP]], i8 [[TMP39]], i32 1
-; CHECK-NEXT: [[VEC1:%.*]] = insertelement <8 x i8> [[VEC0]], i8 [[TMP40]], i32 1
-; CHECK-NEXT: [[TMP41:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP42:%.*]] = and <8 x i64> [[A0]], [[TMP41]]
-; CHECK-NEXT: [[TMP43:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP44:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP45:%.*]] = and <8 x i64> [[A1]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP47:%.*]] = icmp ule <8 x i64> [[TMP42]], [[TMP46]]
-; CHECK-NEXT: [[TMP48:%.*]] = icmp ule <8 x i64> [[TMP43]], [[TMP45]]
-; CHECK-NEXT: [[TMP49:%.*]] = xor <8 x i1> [[TMP47]], [[TMP48]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp ule <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP51:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP52:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP53:%.*]] = and <8 x i1> [[TMP49]], [[TMP51]]
-; CHECK-NEXT: [[TMP54:%.*]] = and <8 x i1> [[TMP50]], [[TMP51]]
-; CHECK-NEXT: [[TMP55:%.*]] = and <8 x i1> [[TMP49]], [[TMP52]]
-; CHECK-NEXT: [[TMP56:%.*]] = or <8 x i1> [[TMP53]], [[TMP54]]
-; CHECK-NEXT: [[TMP57:%.*]] = or <8 x i1> [[TMP56]], [[TMP55]]
-; CHECK-NEXT: [[TMP58:%.*]] = and <8 x i1> [[TMP50]], [[TMP52]]
-; CHECK-NEXT: [[TMP59:%.*]] = bitcast <8 x i1> [[TMP57]] to i8
-; CHECK-NEXT: [[TMP60:%.*]] = bitcast <8 x i1> [[TMP58]] to i8
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i8> [[_MSPROP1]], i8 [[TMP59]], i32 2
-; CHECK-NEXT: [[VEC2:%.*]] = insertelement <8 x i8> [[VEC1]], i8 [[TMP60]], i32 2
-; CHECK-NEXT: [[TMP61:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP62:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP63:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP64:%.*]] = and <8 x i1> zeroinitializer, [[TMP61]]
-; CHECK-NEXT: [[TMP65:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP66:%.*]] = or <8 x i1> [[TMP63]], [[TMP64]]
-; CHECK-NEXT: [[TMP67:%.*]] = or <8 x i1> [[TMP66]], [[TMP65]]
-; CHECK-NEXT: [[TMP68:%.*]] = and <8 x i1> zeroinitializer, [[TMP62]]
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast <8 x i1> [[TMP67]] to i8
-; CHECK-NEXT: [[TMP70:%.*]] = bitcast <8 x i1> [[TMP68]] to i8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i8> [[_MSPROP2]], i8 [[TMP69]], i32 3
-; CHECK-NEXT: [[VEC3:%.*]] = insertelement <8 x i8> [[VEC2]], i8 [[TMP70]], i32 3
-; CHECK-NEXT: [[TMP71:%.*]] = xor <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP72:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP73:%.*]] = icmp ne <8 x i64> [[TMP72]], zeroinitializer
-; CHECK-NEXT: [[TMP74:%.*]] = xor <8 x i64> [[TMP72]], splat (i64 -1)
-; CHECK-NEXT: [[TMP75:%.*]] = and <8 x i64> [[TMP74]], [[TMP71]]
-; CHECK-NEXT: [[TMP76:%.*]] = icmp eq <8 x i64> [[TMP75]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP4:%.*]] = and <8 x i1> [[TMP73]], [[TMP76]]
-; CHECK-NEXT: [[TMP77:%.*]] = icmp ne <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP78:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP79:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP80:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP78]]
-; CHECK-NEXT: [[TMP81:%.*]] = and <8 x i1> [[TMP77]], [[TMP78]]
-; CHECK-NEXT: [[TMP82:%.*]] = and <8 x i1> [[_MSPROP_ICMP4]], [[TMP79]]
-; CHECK-NEXT: [[TMP83:%.*]] = or <8 x i1> [[TMP80]], [[TMP81]]
-; CHECK-NEXT: [[TMP84:%.*]] = or <8 x i1> [[TMP83]], [[TMP82]]
-; CHECK-NEXT: [[TMP85:%.*]] = and <8 x i1> [[TMP77]], [[TMP79]]
-; CHECK-NEXT: [[TMP86:%.*]] = bitcast <8 x i1> [[TMP84]] to i8
-; CHECK-NEXT: [[TMP87:%.*]] = bitcast <8 x i1> [[TMP85]] to i8
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i8> [[_MSPROP3]], i8 [[TMP86]], i32 4
-; CHECK-NEXT: [[VEC4:%.*]] = insertelement <8 x i8> [[VEC3]], i8 [[TMP87]], i32 4
-; CHECK-NEXT: [[TMP88:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP89:%.*]] = and <8 x i64> [[A0]], [[TMP88]]
-; CHECK-NEXT: [[TMP90:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP91:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP92:%.*]] = and <8 x i64> [[A1]], [[TMP91]]
-; CHECK-NEXT: [[TMP93:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP94:%.*]] = icmp uge <8 x i64> [[TMP89]], [[TMP93]]
-; CHECK-NEXT: [[TMP95:%.*]] = icmp uge <8 x i64> [[TMP90]], [[TMP92]]
-; CHECK-NEXT: [[TMP96:%.*]] = xor <8 x i1> [[TMP94]], [[TMP95]]
-; CHECK-NEXT: [[TMP97:%.*]] = icmp uge <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP98:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP99:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP100:%.*]] = and <8 x i1> [[TMP96]], [[TMP98]]
-; CHECK-NEXT: [[TMP101:%.*]] = and <8 x i1> [[TMP97]], [[TMP98]]
-; CHECK-NEXT: [[TMP102:%.*]] = and <8 x i1> [[TMP96]], [[TMP99]]
-; CHECK-NEXT: [[TMP103:%.*]] = or <8 x i1> [[TMP100]], [[TMP101]]
-; CHECK-NEXT: [[TMP104:%.*]] = or <8 x i1> [[TMP103]], [[TMP102]]
-; CHECK-NEXT: [[TMP105:%.*]] = and <8 x i1> [[TMP97]], [[TMP99]]
-; CHECK-NEXT: [[TMP106:%.*]] = bitcast <8 x i1> [[TMP104]] to i8
-; CHECK-NEXT: [[TMP107:%.*]] = bitcast <8 x i1> [[TMP105]] to i8
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i8> [[_MSPROP5]], i8 [[TMP106]], i32 5
-; CHECK-NEXT: [[VEC5:%.*]] = insertelement <8 x i8> [[VEC4]], i8 [[TMP107]], i32 5
-; CHECK-NEXT: [[TMP108:%.*]] = xor <8 x i64> [[TMP1]], splat (i64 -1)
-; CHECK-NEXT: [[TMP109:%.*]] = and <8 x i64> [[A0]], [[TMP108]]
-; CHECK-NEXT: [[TMP110:%.*]] = or <8 x i64> [[A0]], [[TMP1]]
-; CHECK-NEXT: [[TMP111:%.*]] = xor <8 x i64> [[TMP2]], splat (i64 -1)
-; CHECK-NEXT: [[TMP112:%.*]] = and <8 x i64> [[A1]], [[TMP111]]
-; CHECK-NEXT: [[TMP113:%.*]] = or <8 x i64> [[A1]], [[TMP2]]
-; CHECK-NEXT: [[TMP114:%.*]] = icmp ugt <8 x i64> [[TMP109]], [[TMP113]]
-; CHECK-NEXT: [[TMP115:%.*]] = icmp ugt <8 x i64> [[TMP110]], [[TMP112]]
-; CHECK-NEXT: [[TMP116:%.*]] = xor <8 x i1> [[TMP114]], [[TMP115]]
-; CHECK-NEXT: [[TMP117:%.*]] = icmp ugt <8 x i64> [[A0]], [[A1]]
-; CHECK-NEXT: [[TMP118:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP119:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP120:%.*]] = and <8 x i1> [[TMP116]], [[TMP118]]
-; CHECK-NEXT: [[TMP121:%.*]] = and <8 x i1> [[TMP117]], [[TMP118]]
-; CHECK-NEXT: [[TMP122:%.*]] = and <8 x i1> [[TMP116]], [[TMP119]]
-; CHECK-NEXT: [[TMP123:%.*]] = or <8 x i1> [[TMP120]], [[TMP121]]
-; CHECK-NEXT: [[TMP124:%.*]] = or <8 x i1> [[TMP123]], [[TMP122]]
-; CHECK-NEXT: [[TMP125:%.*]] = and <8 x i1> [[TMP117]], [[TMP119]]
-; CHECK-NEXT: [[TMP126:%.*]] = bitcast <8 x i1> [[TMP124]] to i8
-; CHECK-NEXT: [[TMP127:%.*]] = bitcast <8 x i1> [[TMP125]] to i8
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i8> [[_MSPROP6]], i8 [[TMP126]], i32 6
-; CHECK-NEXT: [[VEC6:%.*]] = insertelement <8 x i8> [[VEC5]], i8 [[TMP127]], i32 6
-; CHECK-NEXT: [[TMP128:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP129:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
-; CHECK-NEXT: [[TMP130:%.*]] = and <8 x i1> zeroinitializer, [[TMP128]]
-; CHECK-NEXT: [[TMP131:%.*]] = and <8 x i1> splat (i1 true), [[TMP128]]
[[TMP128]] -; CHECK-NEXT: [[TMP132:%.*]] = and <8 x i1> zeroinitializer, [[TMP129]] -; CHECK-NEXT: [[TMP133:%.*]] = or <8 x i1> [[TMP130]], [[TMP131]] -; CHECK-NEXT: [[TMP134:%.*]] = or <8 x i1> [[TMP133]], [[TMP132]] -; CHECK-NEXT: [[TMP135:%.*]] = and <8 x i1> splat (i1 true), [[TMP129]] -; CHECK-NEXT: [[TMP136:%.*]] = bitcast <8 x i1> [[TMP134]] to i8 -; CHECK-NEXT: [[TMP137:%.*]] = bitcast <8 x i1> [[TMP135]] to i8 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <8 x i8> [[_MSPROP7]], i8 [[TMP136]], i32 7 -; CHECK-NEXT: [[VEC7:%.*]] = insertelement <8 x i8> [[VEC6]], i8 [[TMP137]], i32 7 -; CHECK-NEXT: store <8 x i8> [[_MSPROP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i8> [[VEC7]] -; - %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask) - %vec0 = insertelement <8 x i8> undef, i8 %res0, i32 0 - %res1 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 1, i8 %mask) - %vec1 = insertelement <8 x i8> %vec0, i8 %res1, i32 1 - %res2 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 2, i8 %mask) - %vec2 = insertelement <8 x i8> %vec1, i8 %res2, i32 2 - %res3 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 3, i8 %mask) - %vec3 = insertelement <8 x i8> %vec2, i8 %res3, i32 3 - %res4 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 4, i8 %mask) - %vec4 = insertelement <8 x i8> %vec3, i8 %res4, i32 4 - %res5 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 5, i8 %mask) - %vec5 = insertelement <8 x i8> %vec4, i8 %res5, i32 5 - %res6 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 6, i8 %mask) - %vec6 = insertelement <8 x i8> %vec5, i8 %res6, i32 6 - %res7 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 7, i8 %mask) - %vec7 = insertelement <8 x i8> %vec6, i8 %res7, i32 7 - ret <8 x i8> %vec7 -} - -declare i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwind readnone - -declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512(<4 x float> %x0, <16 x float> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X0:%.*]], <4 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 
x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP7]], <16 x float> [[TMP5]], <16 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[MASK]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x float> [[TMP15]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP16]], <16 x i32> [[TMP22]], <16 x i32> [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP17]], <16 x float> [[TMP15]], <16 x float> zeroinitializer -; CHECK-NEXT: [[_MSPROP4:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSPROP_SELECT]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[TMP4]], [[TMP14]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or <16 x i32> [[_MSPROP_SELECT3]], [[_MSPROP4]] -; CHECK-NEXT: [[RES5:%.*]] = fadd <16 x float> [[TMP23]], [[RES4]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP5]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES5]] -; - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) - %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %mask) - %res4 = fadd <16 x float> %res1, %res2 - %res5 = fadd <16 x float> %res3, %res4 - ret <16 x float> %res5 -} - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(ptr %x0ptr, <16 x float> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf32x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x float>, ptr [[X0PTR:%.*]], align 16 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[X0]], <4 x float> [[X0]], <16 
x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[_MSPROP]] -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %x0 = load <4 x float>, ptr %x0ptr - %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x4.512(<4 x float> %x0, <16 x float> %x2, i16 %mask) - ret <16 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_broadcastf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[_MSPROP]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcastf64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[X0:%.*]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(ptr %x0ptr, <8 x double> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcastf64x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x double>, ptr [[X0PTR:%.*]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[X0]], <4 x double> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> 
[[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[_MSPROP]] -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP18]] -; - %x0 = load <4 x double>, ptr %x0ptr - %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - ret <8 x double> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32>, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_broadcasti32x4_512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X0:%.*]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[MASK]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[_MSPROP2]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT3:%.*]] = select <16 x i1> [[TMP14]], <16 x i32> [[TMP19]], <16 x i32> [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP15]], <16 x i32> [[TMP13]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { 
<16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP4]], 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP21]], <16 x i32> [[_MSPROP_SELECT]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP12]], 1 -; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP22]], <16 x i32> [[_MSPROP_SELECT3]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP20]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP23]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(ptr %x0ptr, <16 x i32> %x2, i16 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti32x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x i32>, ptr [[X0PTR:%.*]], align 16 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[_MSLD]], <4 x i32> [[_MSLD]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[X0]], <4 x i32> [[X0]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %x0 = load <4 x i32>, ptr %x0ptr - %res = call <16 x 
i32> @llvm.x86.avx512.mask.broadcasti32x4.512(<4 x i32> %x0, <16 x i32> %x2, i16 %mask) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_broadcasti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X2:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_broadcasti64x4_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[X0:%.*]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> 
[[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP3]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP10]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(ptr %x0ptr, <8 x i64> %x2, i8 %mask) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_broadcasti64x4_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X0:%.*]] = load <4 x i64>, ptr [[X0PTR:%.*]], align 32 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X0PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i64>, ptr [[TMP8]], align 32 -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[_MSLD]], <4 x i64> [[_MSLD]], <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[X0]], <4 x i64> [[X0]], <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[_MSPROP]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP16]] -; - %x0 = load <4 x i64>, ptr %x0ptr - %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pabs_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pabs_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { -; -; 
CHECK-LABEL: @test_int_x86_avx512_mask_pabs_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[X0:%.*]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pabs_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pabs_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pabs_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.abs.v8i64(<8 x i64> [[X0:%.*]], i1 false) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 
%x2) - ret <8 x i64> %res -} - -define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) #0 { -; -; CHECK-LABEL: @test_vptestmq( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[A0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[A1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP]] to i8 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[A0]], [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP1]], [[A1]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP26]], splat (i64 -1) -; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP28]], [[TMP25]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <8 x i64> [[TMP29]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP27]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[M:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP32]] -; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP31]], [[TMP32]] -; CHECK-NEXT: [[TMP36:%.*]] = and <8 x i1> [[_MSPROP_ICMP1]], [[TMP33]] -; CHECK-NEXT: [[TMP37:%.*]] = or <8 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP38:%.*]] = or <8 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP39:%.*]] = and <8 x i1> [[TMP31]], [[TMP33]] -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[TMP38]] to i8 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP40]], [[TMP17]] -; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP41]], [[TMP18]] -; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES2]] -; - %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1) - %res1 = call i8 
@llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m) - %res2 = add i8 %res1, %res - ret i8 %res2 -} -declare i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64>, <8 x i64>, i8) - -define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) #0 { -; -; CHECK-LABEL: @test_vptestmd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[A0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[A1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i1> [[TMP16]] to i16 -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i32> [[A0]], [[TMP2]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i32> [[TMP1]], [[A1]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i32> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i32> [[A0]], [[A1]] -; CHECK-NEXT: [[TMP25:%.*]] = xor <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP23]], zeroinitializer -; CHECK-NEXT: [[TMP27:%.*]] = icmp ne <16 x i32> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP28:%.*]] = xor <16 x i32> [[TMP26]], splat (i32 -1) -; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP28]], [[TMP25]] -; CHECK-NEXT: [[TMP30:%.*]] = icmp eq <16 x i32> [[TMP29]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP27]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[M:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP32]] -; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i1> [[TMP31]], [[TMP32]] -; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[_MSPROP_ICMP1]], [[TMP33]] -; CHECK-NEXT: [[TMP37:%.*]] = or <16 x i1> [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP38:%.*]] = or <16 x i1> [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[TMP39:%.*]] = and <16 x i1> [[TMP31]], [[TMP33]] -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[TMP38]] to i16 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP40]], [[TMP17]] -; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP41]], 
[[TMP18]] -; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[RES2]] -; - %res = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1) - %res1 = call i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 %m) - %res2 = add i16 %res1, %res - ret i16 %res2 -} -declare i16 @llvm.x86.avx512.ptestm.d.512(<16 x i32>, <16 x i32>, i16) - -declare i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32>, <16 x i32>, i16 %x2) - -define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_ptestnm_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[X0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i32> [[TMP1]], [[X1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i32> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], splat (i32 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP_ICMP]], [[TMP18]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP16]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 -; CHECK-NEXT: [[TMP27:%.*]] = and <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP28:%.*]] = and <16 x i32> [[X0]], [[TMP2]] -; CHECK-NEXT: [[TMP29:%.*]] = and <16 x i32> [[TMP1]], [[X1]] -; CHECK-NEXT: [[TMP30:%.*]] = or <16 x i32> [[TMP27]], [[TMP28]] -; CHECK-NEXT: [[TMP31:%.*]] = or <16 x i32> [[TMP30]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = and <16 x i32> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP33:%.*]] = xor <16 x i32> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP34:%.*]] = or <16 x i32> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i32> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = xor <16 x i32> [[TMP34]], splat (i32 -1) -; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i32> [[TMP36]], [[TMP33]] -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <16 x i32> [[TMP37]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> 
[[TMP35]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <16 x i32> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <16 x i1> [[TMP39]] to i16 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i16 [[TMP25]], [[TMP40]] -; CHECK-NEXT: [[RES2:%.*]] = add i16 [[TMP26]], [[TMP41]] -; CHECK-NEXT: store i16 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[RES2]] -; - %res = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call i16 @llvm.x86.avx512.ptestnm.d.512(<16 x i32> %x0, <16 x i32> %x1, i16-1) - %res2 = add i16 %res, %res1 - ret i16 %res2 -} - -declare i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64>, <8 x i64>, i8 %x2) - -define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_ptestnm_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[X0:%.*]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP1]], [[X1:%.*]] -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP4]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], splat (i64 -1) -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <8 x i1> [[TMP12]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i1> [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i1> [[_MSPROP_ICMP]], [[TMP18]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i1> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i1> [[TMP16]], [[TMP18]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <8 x i1> [[TMP23]] to i8 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i1> [[TMP24]] to i8 -; CHECK-NEXT: [[TMP27:%.*]] = and <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP28:%.*]] = and <8 x i64> [[X0]], [[TMP2]] -; CHECK-NEXT: [[TMP29:%.*]] = and <8 x i64> [[TMP1]], [[X1]] -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP27]], [[TMP28]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = and <8 x i64> [[X0]], [[X1]] -; CHECK-NEXT: [[TMP33:%.*]] = xor <8 x i64> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP34:%.*]] = or <8 x i64> [[TMP31]], zeroinitializer -; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i64> [[TMP34]], zeroinitializer -; CHECK-NEXT: [[TMP36:%.*]] = xor <8 x 
i64> [[TMP34]], splat (i64 -1) -; CHECK-NEXT: [[TMP37:%.*]] = and <8 x i64> [[TMP36]], [[TMP33]] -; CHECK-NEXT: [[TMP38:%.*]] = icmp eq <8 x i64> [[TMP37]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <8 x i1> [[TMP35]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = icmp eq <8 x i64> [[TMP32]], zeroinitializer -; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i1> [[_MSPROP_ICMP1]] to i8 -; CHECK-NEXT: [[TMP41:%.*]] = bitcast <8 x i1> [[TMP39]] to i8 -; CHECK-NEXT: [[_MSPROP:%.*]] = or i8 [[TMP25]], [[TMP40]] -; CHECK-NEXT: [[RES2:%.*]] = add i8 [[TMP26]], [[TMP41]] -; CHECK-NEXT: store i8 [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES2]] -; - %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1) - %res2 = add i8 %res, %res1 - ret i8 %res2 -} - -declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone -define i16 @test_kand(i16 %a0, i16 %a1) #0 { -; -; CHECK-LABEL: @test_kand( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP3]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i1> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i1> [[TMP9]] to i16 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[TMP11]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i1> [[TMP13]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = and <16 x i1> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[TMP13]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i1> [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i1> [[TMP20]], [[TMP19]] -; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x i1> [[TMP21]] to i16 -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x i1> [[TMP22]] to i16 -; CHECK-NEXT: store i16 [[TMP23]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP24]] -; - %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8) - %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1) - ret i16 %t2 -} - -declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone -define i16 @test_kandn(i16 %a0, i16 %a1) #0 { -; -; CHECK-LABEL: @test_kandn( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to 
<16 x i1> -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[_MSPROP]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[_MSPROP]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) -; CHECK-NEXT: [[TMP19:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[_MSPROP1]], [[TMP17]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i1> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <16 x i1> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x i1> [[TMP23]] to i16 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 -; CHECK-NEXT: store i16 [[TMP25]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP26]] -; - %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8) - %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1) - ret i16 %t2 -} - -declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone -define i16 @test_knot(i16 %a0) #0 { -; -; CHECK-LABEL: @test_knot( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = xor <16 x i1> [[TMP3]], splat (i1 true) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: store i16 [[TMP5]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP6]] -; - %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0) - ret i16 %res -} - -declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone -define i16 @test_kor(i16 %a0, i16 %a1) #0 { -; -; CHECK-LABEL: @test_kor( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true) -; CHECK-NEXT: [[TMP6:%.*]] = and <16 x i1> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = and <16 x i1> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP3]], <i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x
i16> splat (i16 8) to <16 x i1>), i32 0), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 1), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 2), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 3), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 4), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 5), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 6), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 7), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 8), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 9), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 10), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 11), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 12), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 13), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 14), i1 true), i1 xor (i1 extractelement (<16 x i1> bitcast (<1 x i16> splat (i16 8) to <16 x i1>), i32 15), i1 true)> -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i1> [[TMP10]] to i16 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[TMP11]] to i16 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16 [[TMP12]] to <16 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16 [[TMP13]] to <16 x i1> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = xor <16 x i1> [[TMP15]], splat (i1 true) -; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i1> [[TMP17]], splat (i1 true) -; CHECK-NEXT: [[TMP20:%.*]] = and <16 x i1> [[TMP14]], [[TMP16]] -; CHECK-NEXT: [[TMP21:%.*]] = and <16 x i1> [[TMP18]], [[TMP16]] -; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP14]], [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = or <16 x i1> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <16 x i1> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i1> [[TMP15]], [[TMP17]] -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[TMP24]] to i16 -; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP25]] to i16 -; CHECK-NEXT: store i16 [[TMP26]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP27]] -; - %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8) - %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1) - ret i16 %t2 -} - -declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone -; TODO: the two kxnor instructions here are no-ops and should be eliminated, -; probably by FoldConstantArithmetic in SelectionDAG. 
-define i16 @test_kxnor(i16 %a0, i16 %a1) #0 {
-;
-; CHECK-LABEL: @test_kxnor(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], splat (i1 true)
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[_MSPROP]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <16 x i1> [[TMP5]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP6]] to i16
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP8]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <16 x i1> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i1> [[TMP10]], splat (i1 true)
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i1> [[_MSPROP2]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i1> [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i1> [[_MSPROP3]] to i16
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: store i16 [[TMP15]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP16]]
-;
- %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
- %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1)
- ret i16 %t2
-}
-
-declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone
-define i16 @test_kxor(i16 %a0, i16 %a1) #0 {
-;
-; CHECK-LABEL: @test_kxor(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[A0:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i1> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = xor <16 x i1> [[TMP4]], bitcast (<1 x i16> splat (i16 8) to <16 x i1>)
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[_MSPROP]] to i16
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[TMP5]] to i16
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP6]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP7]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[A1:%.*]] to <16 x i1>
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i1> [[TMP8]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i1> [[TMP9]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i1> [[_MSPROP1]] to i16
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i1> [[TMP12]] to i16
-; CHECK-NEXT: store i16 [[TMP13]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP14]]
-;
- %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)
- %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1)
- ret i16 %t2
-}
-
-declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
-define i32 @test_kortestz(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) #0 {
-; CHECK-LABEL: @test_kortestz(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1)
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1)
-; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]]
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true)
-; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true)
-; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]]
-; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16
-; CHECK-NEXT: [[TMP44:%.*]] = xor i16 [[TMP43]], 0
-; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0
-; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1
-; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32
-; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32
-; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[TMP51]]
-;
-entry:
- %0 = bitcast <8 x i64> %A to <16 x i32>
- %1 = bitcast <8 x i64> %B to <16 x i32>
- %2 = icmp ne <16 x i32> %0, %1
- %3 = bitcast <8 x i64> %C to <16 x i32>
- %4 = bitcast <8 x i64> %D to <16 x i32>
- %5 = icmp ne <16 x i32> %3, %4
- %6 = bitcast <16 x i1> %2 to i16
- %7 = bitcast <16 x i1> %5 to i16
- %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
-define i32 @test_kortestc(<8 x i64> %A, <8 x i64> %B, <8 x i64> %C, <8 x i64> %D) #0 {
-; CHECK-LABEL: @test_kortestc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[A:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP1]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[B:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], splat (i32 -1)
-; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP:%.*]] = and <16 x i1> [[TMP10]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i32> [[TMP5]], [[TMP7]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[C:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i64> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i64> [[D:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = or <16 x i32> [[TMP15]], [[TMP17]]
-; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i32> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = xor <16 x i32> [[TMP20]], splat (i32 -1)
-; CHECK-NEXT: [[TMP23:%.*]] = and <16 x i32> [[TMP22]], [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq <16 x i32> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_ICMP1:%.*]] = and <16 x i1> [[TMP21]], [[TMP24]]
-; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i32> [[TMP16]], [[TMP18]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP]] to i16
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast <16 x i1> [[TMP14]] to i16
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast <16 x i1> [[_MSPROP_ICMP1]] to i16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i1> [[TMP25]] to i16
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast i16 [[TMP26]] to <16 x i1>
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i16 [[TMP27]] to <16 x i1>
-; CHECK-NEXT: [[TMP32:%.*]] = bitcast i16 [[TMP28]] to <16 x i1>
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i16 [[TMP29]] to <16 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = xor <16 x i1> [[TMP31]], splat (i1 true)
-; CHECK-NEXT: [[TMP35:%.*]] = xor <16 x i1> [[TMP33]], splat (i1 true)
-; CHECK-NEXT: [[TMP36:%.*]] = and <16 x i1> [[TMP30]], [[TMP32]]
-; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i1> [[TMP34]], [[TMP32]]
-; CHECK-NEXT: [[TMP38:%.*]] = and <16 x i1> [[TMP30]], [[TMP35]]
-; CHECK-NEXT: [[TMP39:%.*]] = or <16 x i1> [[TMP36]], [[TMP37]]
-; CHECK-NEXT: [[TMP40:%.*]] = or <16 x i1> [[TMP39]], [[TMP38]]
-; CHECK-NEXT: [[TMP41:%.*]] = or <16 x i1> [[TMP31]], [[TMP33]]
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast <16 x i1> [[TMP40]] to i16
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast <16 x i1> [[TMP41]] to i16
-; CHECK-NEXT: [[TMP44:%.*]] = xor i16 [[TMP43]], 0
-; CHECK-NEXT: [[TMP45:%.*]] = or i16 [[TMP42]], 0
-; CHECK-NEXT: [[TMP46:%.*]] = icmp ne i16 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = xor i16 [[TMP45]], -1
-; CHECK-NEXT: [[TMP48:%.*]] = and i16 [[TMP47]], [[TMP44]]
-; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP48]], 0
-; CHECK-NEXT: [[_MSPROP_ICMP2:%.*]] = and i1 [[TMP46]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i16 [[TMP43]], 0
-; CHECK-NEXT: [[_MSPROP:%.*]] = zext i1 [[_MSPROP_ICMP2]] to i32
-; CHECK-NEXT: [[TMP51:%.*]] = zext i1 [[TMP50]] to i32
-; CHECK-NEXT: store i32 [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[TMP51]]
-;
-entry:
- %0 = bitcast <8 x i64> %A to <16 x i32>
- %1 = bitcast <8 x i64> %B to <16 x i32>
- %2 = icmp ne <16 x i32> %0, %1
- %3 = bitcast <8 x i64> %C to <16 x i32>
- %4 = bitcast <8 x i64> %D to <16 x i32>
- %5 = icmp ne <16 x i32> %3, %4
- %6 = bitcast <16 x i1> %2 to i16
- %7 = bitcast <16 x i1> %5 to i16
- %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %6, i16 %7)
- ret i32 %res
-}
-
-define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 {
-; CHECK-LABEL: @test_cmpps(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16
-; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i16 [[TMP7]]
-;
- %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
- ret i16 %res
-}
-declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> , <16 x float> , i32, i16, i32)
-
-define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 {
-; CHECK-LABEL: @test_cmppd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[TMP7]]
-;
- %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
- ret i8 %res
-}
-declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32)
-
-define <8 x i64> @test_mul_epi32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mul_epi32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP3]], splat (i64 32)
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32)
-; CHECK-NEXT: [[TMP10:%.*]] = ashr <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32)
-; CHECK-NEXT: [[TMP12:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = ashr <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = shl <8 x i64> [[TMP4]], splat (i64 32)
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = shl <8 x i64> [[TMP5]], splat (i64 32)
-; CHECK-NEXT: [[TMP11:%.*]] = ashr <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = ashr <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP6]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP7]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = shl <8 x i64> [[TMP8]], splat (i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = ashr <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = ashr <8 x i64> [[TMP19]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP24]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = shl <8 x i64> [[TMP9]], splat (i64 32)
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = ashr <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = or <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = ashr <8 x i64> [[TMP20]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP25]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %b = bitcast <8 x i64> %b64 to <16 x i32>
- %res = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmbk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = ashr <8 x i64> [[TMP17]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = shl <8 x i64> [[TMP14]], splat (i64 32)
-; CHECK-NEXT: [[TMP24:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32)
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[TMP26:%.*]] = ashr <8 x i64> [[TMP23]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %b = bitcast <8 x i64> %b64 to <16 x i32>
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epi32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epi32_rmbkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP9]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = shl <8 x i64> [[TMP10]], splat (i64 32)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = shl <8 x i64> [[TMP11]], splat (i64 32)
-; CHECK-NEXT: [[TMP17:%.*]] = ashr <8 x i64> [[TMP15]], splat (i64 32)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = ashr <8 x i64> [[TMP16]], splat (i64 32)
-; CHECK-NEXT: [[TMP20:%.*]] = shl <8 x i64> [[TMP12]], splat (i64 32)
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = shl <8 x i64> [[TMP13]], splat (i64 32)
-; CHECK-NEXT: [[TMP23:%.*]] = ashr <8 x i64> [[TMP21]], splat (i64 32)
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], zeroinitializer
-; CHECK-NEXT: [[TMP25:%.*]] = ashr <8 x i64> [[TMP22]], splat (i64 32)
-; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %q = load i64, ptr %ptr_b
- %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0
- %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer
- %b = bitcast <8 x i64> %b64 to <16 x i32>
- %mul = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>)
-
-define <8 x i64> @test_mul_epu32_rr(<16 x i32> %a, <16 x i32> %b) #0 {
-; CHECK-LABEL: @test_mul_epu32_rr(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = and <8 x i64> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP3]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP11]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP19]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rrk(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = mul <8 x i64> [[TMP14]], [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP24:%.*]] = xor <8 x i64> [[TMP21]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP26:%.*]] = or <8 x i64> [[TMP25]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP22]], <8 x i64> [[TMP26]], <8 x i64> [[TMP23]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP21]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rrkz(<16 x i32> %a, <16 x i32> %b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rrkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[B:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i64> [[TMP4]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP5]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP6]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP7]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP6]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP7]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP12]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = mul <8 x i64> [[TMP13]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP23:%.*]] = xor <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP21]], <8 x i64> [[TMP25]], <8 x i64> [[TMP22]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP20]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rm(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP7]], align 64
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP8]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP16]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = mul <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP24]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rmk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rmk(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]]
-; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], [[PASSTHRU:%.*]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> [[PASSTHRU]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rmkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rmkz(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[B:%.*]] = load <16 x i32>, ptr [[PTR_B:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]]
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP27:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = xor <8 x i64> [[TMP25]], zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = or <8 x i64> [[TMP28]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP26]], <8 x i64> [[TMP30]], <8 x i64> [[TMP27]]
-; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP25]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[RES]]
-;
- %b = load <16 x i32>, ptr %ptr_b
- %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b)
- %mask.cast = bitcast i8 %mask to <8 x i1>
- %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer
- ret < 8 x i64> %res
-}
-
-define <8 x i64> @test_mul_epu32_rmb(<16 x i32> %a, ptr %ptr_b) #0 {
-;
-; CHECK-LABEL: @test_mul_epu32_rmb(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]]
-; CHECK: 3:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 4:
-; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32>
-; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP9]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295)
-; CHECK-NEXT: [[TMP19:%.*]]
= and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP19]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP17]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = mul <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP2]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP25]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %res = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) - ret < 8 x i64> %res -} - -define <8 x i64> @test_mul_epu32_rmbk(<16 x i32> %a, ptr %ptr_b, <8 x i64> %passThru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mul_epu32_rmbk( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], 
zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP21]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[TMP26:%.*]] = and <8 x i64> [[TMP14]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = mul <8 x i64> [[TMP20]], [[TMP26]] -; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP30:%.*]] = xor <8 x i64> [[TMP27]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP32:%.*]] = or <8 x i64> [[TMP31]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP28]], <8 x i64> [[TMP32]], <8 x i64> [[TMP29]] -; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP27]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> %passThru - ret < 8 x i64> %res -} - -define <8 x i64> @test_mul_epu32_rmbkz(<16 x i32> %a, ptr %ptr_b, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mul_epu32_rmbkz( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load i64, ptr [[PTR_B:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i64> undef, i64 [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[B64:%.*]] = shufflevector <8 x i64> [[VECINIT_I]], <8 x i64> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to <16 x i32> -; CHECK-NEXT: [[B:%.*]] = bitcast <8 x i64> [[B64]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[A:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast 
<16 x i32> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[B]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i64> [[TMP10]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP11]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP20:%.*]] = and <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = and <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i64> [[TMP12]], splat (i64 4294967295) -; CHECK-NEXT: [[TMP23:%.*]] = or <8 x i64> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or <8 x i64> [[TMP23]], [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = and <8 x i64> [[TMP13]], splat (i64 4294967295) -; CHECK-NEXT: [[_MSPROP2:%.*]] = or <8 x i64> [[TMP18]], [[TMP24]] -; CHECK-NEXT: [[TMP26:%.*]] = mul <8 x i64> [[TMP19]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP28:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[_MSPROP2]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = xor <8 x i64> [[TMP26]], zeroinitializer -; CHECK-NEXT: [[TMP30:%.*]] = or <8 x i64> [[TMP29]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP31:%.*]] = or <8 x i64> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP27]], <8 x i64> [[TMP31]], <8 x i64> [[TMP28]] -; CHECK-NEXT: [[RES:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP26]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %q = load i64, ptr %ptr_b - %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 - %b64 = shufflevector <8 x i64> %vecinit.i, <8 x i64> undef, <8 x i32> zeroinitializer - %b = bitcast <8 x i64> %b64 to <16 x i32> - %mul = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res = select <8 x i1> %mask.cast, <8 x i64> %mul, <8 x i64> zeroinitializer - ret < 8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) - -define <2 x double> @test_x86_avx512_mm_cvtu32_sd(<2 x double> %a, i32 %b) -; -; CHECK-LABEL: @test_x86_avx512_mm_cvtu32_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = uitofp i32 [[B:%.*]] to double -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP4]], i64 0 -; CHECK-NEXT: store <2 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[TMP5]] -; - #0 { - %res = call <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double> %a, i32 %b) ; <<<2 x double>> [#uses=1] - ret <2 x double> %res -} -declare <2 x double> @llvm.x86.avx512.cvtusi2sd(<2 x double>, i32) nounwind readnone - -define <16 x float> @test_x86_vbroadcast_ss_512(ptr %a0) #0 { -; -; CHECK-LABEL: 
@test_x86_vbroadcast_ss_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A0:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <16 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x float> poison, float [[TMP4]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <16 x i32> [[_MSPROP]], i32 [[_MSLD]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <16 x i32> [[_MSPROP1]], i32 [[_MSLD]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[TMP4]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <16 x i32> [[_MSPROP2]], i32 [[_MSLD]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <16 x i32> [[_MSPROP3]], i32 [[_MSLD]], i32 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x float> [[TMP11]], float [[TMP4]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <16 x i32> [[_MSPROP4]], i32 [[_MSLD]], i32 5 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x float> [[TMP12]], float [[TMP4]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <16 x i32> [[_MSPROP5]], i32 [[_MSLD]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x float> [[TMP13]], float [[TMP4]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <16 x i32> [[_MSPROP6]], i32 [[_MSLD]], i32 7 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x float> [[TMP14]], float [[TMP4]], i32 7 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <16 x i32> [[_MSPROP7]], i32 [[_MSLD]], i32 8 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> [[TMP15]], float [[TMP4]], i32 8 -; CHECK-NEXT: [[_MSPROP9:%.*]] = insertelement <16 x i32> [[_MSPROP8]], i32 [[_MSLD]], i32 9 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP4]], i32 9 -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <16 x i32> [[_MSPROP9]], i32 [[_MSLD]], i32 10 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP4]], i32 10 -; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <16 x i32> [[_MSPROP10]], i32 [[_MSLD]], i32 11 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP4]], i32 11 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <16 x i32> [[_MSPROP11]], i32 [[_MSLD]], i32 12 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 12 -; CHECK-NEXT: [[_MSPROP13:%.*]] = insertelement <16 x i32> [[_MSPROP12]], i32 [[_MSLD]], i32 13 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP4]], i32 13 -; CHECK-NEXT: [[_MSPROP14:%.*]] = insertelement <16 x i32> [[_MSPROP13]], i32 [[_MSLD]], i32 14 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP4]], i32 14 -; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <16 x 
i32> [[_MSPROP14]], i32 [[_MSLD]], i32 15 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP4]], i32 15 -; CHECK-NEXT: store <16 x i32> [[_MSPROP15]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP23]] -; - %res = call <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr %a0) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.vbroadcast.ss.512(ptr) nounwind readonly - -define <8 x double> @test_x86_vbroadcast_sd_512(ptr %a0) #0 { -; -; CHECK-LABEL: @test_x86_vbroadcast_sd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[A0:%.*]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP4]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <8 x i64> [[_MSPROP]], i64 [[_MSLD]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP4]], i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <8 x i64> [[_MSPROP1]], i64 [[_MSLD]], i32 2 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP4]], i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <8 x i64> [[_MSPROP2]], i64 [[_MSLD]], i32 3 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP4]], i32 3 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <8 x i64> [[_MSPROP3]], i64 [[_MSLD]], i32 4 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <8 x i64> [[_MSPROP4]], i64 [[_MSLD]], i32 5 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP4]], i32 5 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <8 x i64> [[_MSPROP5]], i64 [[_MSLD]], i32 6 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP4]], i32 6 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <8 x i64> [[_MSPROP6]], i64 [[_MSLD]], i32 7 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP4]], i32 7 -; CHECK-NEXT: store <8 x i64> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP15]] -; - %res = call <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr %a0) ; <<8 x double>> [#uses=1] - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(ptr) nounwind readonly - -declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP7]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP18]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512( -; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP16]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) - ret <8 x double> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP3]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: 
[[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - ret <8 x i64> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 
[[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x 
i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) - ret <16 x float> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP3]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) - -define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call 
void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %res = call <16 x i32> 
@llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) - -define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) - -define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) - 
-define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP17]] -; - %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) - ret <8 x i64> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: 
unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X4:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP10]]
-;
- %x2 = load <16 x i32>, ptr %x2p
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 200) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %x2 = load <16 x i32>, ptr %x2p
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP9]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
- ret <8 x double> %res
-}
-
-define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64>
-; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP20]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP9]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
- ret <16 x float> %res
-}
-
-define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP20]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP4]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]]
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %x2 = load <16 x i32>, ptr %x2p
- %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
-
-define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer
-; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]])
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64>
-; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer
-; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP23]]
-;
- %x2s = load double, ptr %x2ptr
- %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
- %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
- %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
- ret <8 x double> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
-
-define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
- ret <16 x float> %res
-}
-
-
-declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP4]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-declare <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-declare <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32)
-
-define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vsubps_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 11)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 {
-; CHECK-LABEL: @test_vmulps_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 -1, i32 11)
- ret <16 x float> %res
-}
-
-;; mask float
-define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> zeroinitializer, i16 %mask, i32 11)
- ret <16 x float> %res
-}
-
-;; With Passthru value
-define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 8)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 9)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 10)
- ret <16 x float> %res
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulps_mask_passthru_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %res = call <16 x float> @llvm.x86.avx512.mask.mul.ps.512(<16 x float> %a0, <16 x float> %a1,
- <16 x float> %passthru, i16 %mask, i32 11)
- ret <16 x float> %res
-}
-
-;; mask double
-define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 8)
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 9)
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 10)
- ret <8 x double> %res
-}
-
-define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_vmulpd_mask_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.mul.pd.512(<8 x double> %a0, <8 x double> %a1,
- <8 x double> zeroinitializer, i8 %mask, i32 11)
- ret <8 x double> %res
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %res = call <16 x
float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> 
@llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: 
@test_mm512_maskz_add_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 
[[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9) - ret <16 x float> %res -} -define <16 x float> 
@test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], 
!prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_add_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] 
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_add_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.add.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> 
@llvm.x86.avx512.mask.add.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) - -define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = 
or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; 
CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11) - ret <16 x float> %res -} - - -define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; 
CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} -define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() 
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9) - ret <16 x float> %res -} -define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> 
%a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 9)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 10)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 11)
-  ret <16 x float> %res
-}
-
-
-define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 9)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 10)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 11)
-  ret <16 x float> %res
-}
-
-
-define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_div_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4)
-  ret <16 x float> %res
-}
-
-
-define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 9)
-  ret <16 x float> %res
-}
-define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 10)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 11)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_div_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4)
-  ret <16 x float> %res
-}
-declare <16 x float> @llvm.x86.avx512.mask.div.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
-
-define void @test_mask_compress_store_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-
-define void @test_compress_store_pd_512(ptr %addr, <8 x double> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8f64(<8 x double> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.pd.512(ptr %addr, <8 x double> %data, i8 -1)
-  ret void
-}
-
-define void @test_mask_compress_store_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-
-define void @test_compress_store_ps_512(ptr %addr, <16 x float> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16f32(<16 x float> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.ps.512(ptr %addr, <16 x float> %data, i16 -1)
-  ret void
-}
-
-define void @test_mask_compress_store_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP3]], ptr [[TMP8]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-
-define void @test_compress_store_q_512(ptr %addr, <8 x i64> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[TMP2]], ptr [[TMP5]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v8i64(<8 x i64> [[DATA:%.*]], ptr [[ADDR]], <8 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.q.512(ptr %addr, <8 x i64> %data, i8 -1)
-  ret void
-}
-
-define void @test_mask_compress_store_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_compress_store_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP3]], ptr [[TMP8]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-  ret void
-}
-
-declare void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-
-define void @test_compress_store_d_512(ptr %addr, <16 x i32> %data) #0 {
-;
-; CHECK-LABEL: @test_compress_store_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[TMP2]], ptr [[TMP5]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: call void @llvm.masked.compressstore.v16i32(<16 x i32> [[DATA:%.*]], ptr [[ADDR]], <16 x i1> splat (i1 true))
-; CHECK-NEXT: ret void
-;
-  call void @llvm.x86.avx512.mask.compress.store.d.512(ptr %addr, <16 x i32> %data, i16 -1)
-  ret void
-}
-
-define <8 x double> @test_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x double> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP12]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-  ret <8 x double> %res
-}
-
-define <8 x double> @test_maskz_expand_load_pd_512(ptr %addr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x double> zeroinitializer)
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> zeroinitializer, i8 %mask)
-  ret <8 x double> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 %mask)
-
-define <8 x double> @test_expand_load_pd_512(ptr %addr, <8 x double> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x double> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP8]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 -1)
-  ret <8 x double> %res
-}
-
-; Make sure we don't crash if you pass 0 to the mask.
-define <8 x double> @test_zero_mask_expand_load_pd_512(ptr %addr, <8 x double> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_zero_mask_expand_load_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> zeroinitializer, <8 x i64> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.masked.expandload.v8f64(ptr [[ADDR]], <8 x i1> zeroinitializer, <8 x double> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP8]]
-;
-  %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(ptr %addr, <8 x double> %data, i8 0)
-  ret <8 x double> %res
-}
-
-define <16 x float> @test_mask_expand_load_ps_512(ptr %addr, <16 x float> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x float> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP12]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_maskz_expand_load_ps_512(ptr %addr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x float> zeroinitializer)
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP11]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> zeroinitializer, i16 %mask)
-  ret <16 x float> %res
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 %mask)
-
-define <16 x float> @test_expand_load_ps_512(ptr %addr, <16 x float> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.masked.expandload.v16f32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x float> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP8]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.expand.load.ps.512(ptr %addr, <16 x float> %data, i16 -1)
-  ret <16 x float> %res
-}
-
-define <8 x i64> @test_mask_expand_load_q_512(ptr %addr, <8 x i64> %data, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP8]], <8 x i1> [[TMP5]], <8 x i64> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP5]], <8 x i64> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-  ret <8 x i64> %res
-}
-
-define <8 x i64> @test_maskz_expand_load_q_512(ptr %addr, i8 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP7]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> [[TMP4]], <8 x i64> zeroinitializer)
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> zeroinitializer, i8 %mask)
-  ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 %mask)
-
-define <8 x i64> @test_expand_load_q_512(ptr %addr, <8 x i64> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[TMP5]], <8 x i1> splat (i1 true), <8 x i64> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.masked.expandload.v8i64(ptr [[ADDR]], <8 x i1> splat (i1 true), <8 x i64> [[DATA:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP8]]
-;
-  %res = call <8 x i64> @llvm.x86.avx512.mask.expand.load.q.512(ptr %addr, <8 x i64> %data, i8 -1)
-  ret <8 x i64> %res
-}
-
-define <16 x i32> @test_mask_expand_load_d_512(ptr %addr, <16 x i32> %data, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mask_expand_load_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP8]], <16 x i1> [[TMP5]], <16 x i32> [[TMP3]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[TMP12:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP5]], <16 x i32> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
-  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-  ret <16 x i32> %res
-}
-
-define <16 x i32> @test_maskz_expand_load_d_512(ptr %addr, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_maskz_expand_load_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080
-; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP7]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> [[TMP4]], <16 x i32> zeroinitializer)
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
-  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> zeroinitializer, i16 %mask)
-  ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 %mask)
-
-define <16 x i32> @test_expand_load_d_512(ptr %addr, <16 x i32> %data) #0 {
-;
-; CHECK-LABEL: @test_expand_load_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[ADDR:%.*]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[TMP3]], 87960930222080
-; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP4]] to ptr
-; CHECK-NEXT: [[_MSMASKEDEXPLOAD:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[TMP5]], <16 x i1> splat (i1 true), <16 x i32> [[TMP2]])
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.masked.expandload.v16i32(ptr [[ADDR]], <16 x i1> splat (i1 true), <16 x i32> [[DATA:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSMASKEDEXPLOAD]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP8]]
-;
-  %res = call <16 x i32> @llvm.x86.avx512.mask.expand.load.d.512(ptr %addr, <16 x i32> %data, i16 -1)
-  ret <16 x i32> %res
-}
-
-define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
-  %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8)
-  ret <16 x float> %res
-}
-
-define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-;
-; CHECK-LABEL: @test_mm512_mask_min_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-;
CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x 
float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) - -define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call 
void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret 
<16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mm512_mask_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; 
CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} - -define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, <16 x float>zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) - -define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], 
[[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP5]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> undef, i8 -1, i32 11) - ret <8 x double> %res -} -define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP16]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> %passthru, i8 %mask, i32 11) - ret <8 x double> %res -} -define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 %mask, i32 11) - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone - -define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { -; 
CHECK-LABEL: @test_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> undef, i16 -1, i32 4) - ret <16 x float> %res -} -define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 4) - ret <16 x float> %res -} -define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call 
<16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 4) - ret <16 x float> %res -} -define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP5]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 11) - ret <16 x float> %res -} -define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> %passthru, i16 %mask, i32 11) - ret <16 x float> %res -} -define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 %mask, i32 11) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -declare <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_prolv_d_512_old( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP7]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_mask_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_d_512_old( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP16]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - ret <16 x i32> %res -} - -define <16 x i32>@test_int_x86_avx512_maskz_prolv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_d_512_old( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP15]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.prolv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) - ret <16 x i32> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) - -define <8 x i64>@test_int_x86_avx512_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { 
-; CHECK-LABEL: @test_int_x86_avx512_prolv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prolv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_prolv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_prolv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prolv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
-define <16 x i32>@test_int_x86_avx512_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_prorv_d_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]])
-; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP7]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_d_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP9]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP16]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
- ret <16 x i32> %res
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_prorv_d_512_old(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_d_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> [[X1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP14]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP8]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP15]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
- ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
-define <8 x i64>@test_int_x86_avx512_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_prorv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64>
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP7]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prorv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP9]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP8]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP9]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP16]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
- ret <8 x i64> %res
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_prorv_q_512_old(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_maskz_prorv_q_512_old(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64>
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> [[X1]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i64> [[TMP8]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP7]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP14]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP8]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP15]]
-;
- %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
- ret <8 x i64> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32>, i32, <16 x i32>, i16)
-
-define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_prol_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3))
-; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3))
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]]
-; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4))
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4))
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP21:%.*]] = or
<16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } [[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_prol_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; 
CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32>, i32, <16 x i32>, i16) - -define { <16 x i32>, <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_pror_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pror_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: 
[[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0:%.*]], <16 x i32> [[X0]], <16 x i32> splat (i32 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP6]], <16 x i32> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[X3]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP15]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <16 x i32> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <16 x i1> [[TMP17]], <16 x i32> [[TMP22]], <16 x i32> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP18]], <16 x i32> [[TMP16]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP1]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i32> @llvm.fshr.v16i32(<16 x i32> [[X0]], <16 x i32> [[X0]], <16 x i32> splat (i32 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } { <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1) }, <16 x i32> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP27]], <16 x i32> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES3]], <16 x i32> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP28]], <16 x i32> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } [[RES4]], <16 x i32> [[TMP26]], 2 -; CHECK-NEXT: store { <16 x i32>, <16 x i32>, <16 x i32> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <16 x i32>, <16 x i32>, <16 x i32> } 
[[RES5]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 4, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pror.d.512(<16 x i32> %x0, i32 5, <16 x i32> %x2, i16 -1) - %res3 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } poison, <16 x i32> %res0, 0 - %res4 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res3, <16 x i32> %res1, 1 - %res5 = insertvalue { <16 x i32>, <16 x i32>, <16 x i32> } %res4, <16 x i32> %res2, 2 - ret { <16 x i32>, <16 x i32>, <16 x i32> } %res5 -} - -declare <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64>, i32, <8 x i64>, i8) - -define { <8 x i64>, <8 x i64>, <8 x i64> } @test_int_x86_avx512_mask_pror_q_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pror_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0:%.*]], <8 x i64> [[X0]], <8 x i64> splat (i64 3)) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP6]], [[X2:%.*]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP5]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP6]], <8 x i64> [[X2]] -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> splat (i64 4)) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP15]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], [[TMP15]] -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> [[TMP22]], <8 x i64> [[TMP19]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP18]], <8 x i64> [[TMP16]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP24:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[TMP1]], <8 x i64> [[TMP1]], <8 x i64> splat (i64 5)) -; CHECK-NEXT: [[TMP25:%.*]] = or <8 x i64> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> [[X0]], <8 x i64> [[X0]], <8 x i64> 
splat (i64 5)) -; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } { <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1), <8 x i64> splat (i64 -1) }, <8 x i64> [[_MSPROP_SELECT]], 0 -; CHECK-NEXT: [[RES3:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> [[TMP13]], 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP27]], <8 x i64> [[_MSPROP_SELECT1]], 1 -; CHECK-NEXT: [[RES4:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES3]], <8 x i64> [[TMP23]], 1 -; CHECK-NEXT: [[TMP29:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP28]], <8 x i64> [[TMP25]], 2 -; CHECK-NEXT: [[RES5:%.*]] = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } [[RES4]], <8 x i64> [[TMP26]], 2 -; CHECK-NEXT: store { <8 x i64>, <8 x i64>, <8 x i64> } [[TMP29]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret { <8 x i64>, <8 x i64>, <8 x i64> } [[RES5]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 4, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pror.q.512(<8 x i64> %x0, i32 5, <8 x i64> %x2, i8 -1) - %res3 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } poison, <8 x i64> %res0, 0 - %res4 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res3, <8 x i64> %res1, 1 - %res5 = insertvalue { <8 x i64>, <8 x i64>, <8 x i64> } %res4, <8 x i64> %res2, 2 - ret { <8 x i64>, <8 x i64>, <8 x i64> } %res5 -} - -declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double 
[[TMP5]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64 -; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 -; CHECK-NEXT: 
[[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 -} - -declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - -define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 
[[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 
x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 -} - -declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = xor i64 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i64 [[TMP14]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP16]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], double [[TMP8]], double 0.000000e+00 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> [[X0]], double [[TMP17]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x 
i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]] -; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[TMP24:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP19]], double [[TMP20]], double [[TMP21]], i32 11) -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i64 0, i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast double [[TMP24]] to i64 -; CHECK-NEXT: [[TMP30:%.*]] = xor i64 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = or i64 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP31]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP32]], i64 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], double [[TMP24]], double 0.000000e+00 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT11]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <2 x double> [[X0]], double [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP12]] -; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP18]], [[TMP34]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP13]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES2]] -; - %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 11) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 -} - -declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) - -define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: 
[[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[TMP14]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP16]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP11]], float [[TMP8]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[X0]], float [[TMP17]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP15:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR16:%.*]] = or i1 [[_MSOR]], [[_MSCMP15]] -; CHECK-NEXT: br i1 [[_MSOR16]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[TMP24:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP19]], float [[TMP20]], float [[TMP21]], i32 11) -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i1> [[TMP26]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = select i1 [[TMP27]], i32 0, i32 0 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast float [[TMP24]] to i32 -; CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP29]], 0 -; CHECK-NEXT: [[TMP31:%.*]] = or i32 [[TMP30]], 0 -; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[TMP31]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT11:%.*]] = select i1 [[_MSPROP10]], i32 [[TMP32]], i32 [[TMP28]] -; CHECK-NEXT: [[TMP33:%.*]] = select i1 [[TMP27]], float [[TMP24]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT11]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = insertelement <4 x float> [[X0]], float [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP12]] 
-; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP18]], [[TMP34]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP18]]
-;
- %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 11)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64
-; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]]
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0
-; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
-; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
-; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
-; CHECK: 23:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 24:
-; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11)
-; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
-; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
-; CHECK: 30:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 31:
-; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10)
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64
-; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0
-; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]]
-; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]]
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]]
-; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0
-; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]]
-; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]]
-; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES4]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
- %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32
-; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]]
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0
-; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]]
-; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]]
-; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]]
-; CHECK: 23:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 24:
-; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11)
-; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]]
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]]
-; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]]
-; CHECK: 30:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 31:
-; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10)
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]]
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32
-; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32
-; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]]
-; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0
-; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]]
-; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]]
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]]
-; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0
-; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0
-; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]]
-; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
- %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_ss_mask_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
-; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
-; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
-; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
-; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32
-; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]]
-; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]]
-; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]]
-; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0
-; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
-; CHECK: 29:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 30:
-; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
-; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
-; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4
-; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
-; CHECK-NEXT: ret void
-;
- %a.val = load float, ptr %a
- %av0 = insertelement <4 x float> undef, float %a.val, i32 0
- %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
- %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
- %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
-
- %b.val = load float, ptr %b
- %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
- %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
- %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
- %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
-
- %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
-
- %sr = extractelement <4 x float> %vr, i32 0
- store float %sr, ptr %a
- ret void
-}
-
-define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_ss_maskz_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load
float, ptr [[A:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4
-; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1
-; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2
-; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3
-; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]]
-; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32
-; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0
-;
CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]]
-; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00
-; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0
-; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
-; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4
-; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4
-; CHECK-NEXT: ret void
-;
- %a.val = load float, ptr %a
- %av0 = insertelement <4 x float> undef, float %a.val, i32 0
- %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
- %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
- %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
-
- %b.val = load float, ptr %b
- %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
- %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
- %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
- %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
-
- %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
-
- %sr = extractelement <4 x float> %vr, i32 0
- store float %sr, ptr %a
- ret void
-}
-
-define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_sd_mask_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn()
#[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
-; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
-; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
-; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64
-; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]]
-; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]]
-; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]]
-; CHECK: 29:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 30:
-; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080
-; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr
-; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8
-; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a.val = load double, ptr %a
- %av0 = insertelement <2 x double> undef, double %a.val, i32 0
- %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
-
- %b.val = load double, ptr %b
- %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
- %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
-
- %vr = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4)
-
- %sr = extractelement <2 x double> %vr, i32 0
- store double %sr, ptr %a
- ret void
-}
-
-define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 {
-;
-; CHECK-LABEL: @fmadd_sd_maskz_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0
-; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080
-; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr
-; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0
-; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1
-; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0
-; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0
-; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]]
-; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]]
-; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0
-;
CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64
-; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]]
-; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00
-; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0
-; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64
-; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080
-; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr
-; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8
-; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8
-; CHECK-NEXT: ret void
-;
- %a.val = load double, ptr %a
- %av0 = insertelement <2 x double> undef, double %a.val, i32 0
- %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
-
- %b.val = load double, ptr %b
- %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
- %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
-
- %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4)
-
- %sr = extractelement <2 x double> %vr, i32 0
- store double %sr, ptr %a
- ret void
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double
[[TMP8]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
-; CHECK: 26:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 27:
-; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]],
label [[TMP36:%.*]], !prof [[PROF1]]
-; CHECK: 35:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 36:
-; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64
-; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]]
-; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]]
-; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES4]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
- %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-;
CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32
-; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]]
-; CHECK: 26:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 27:
-; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0
-; CHECK-NEXT:
[[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]]
-; CHECK: 35:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 36:
-; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32
-; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32
-; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]]
-; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]]
-; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 11)
- %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT:
[[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64
-; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]]
-; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = fneg
<2 x double> [[X0]]
-; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x double> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
-; CHECK: 38:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 39:
-; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0
-; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64
-; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64
-; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0
-; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]]
-; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]]
-; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES4]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 11)
- %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 10)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res2, %res3
- ret <2 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ) #0 {
-;
-; CHECK-LABEL:
@test_int_x86_avx512_mask3_vfnmsub_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]]
-; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]]
-; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]])
-; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0
-; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32
-; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]]
-; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0
-; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]]
-; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0
-; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0
-; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]]
-; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0
-; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]]
-; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]]
-; CHECK: 28:
-; CHECK-NEXT: call void
@__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 29:
-; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11)
-; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0
-; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]]
-; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]]
-; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0
-; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0
-; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0
-; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0
-; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0
-; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]]
-; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0
-; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]]
-; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]]
-; CHECK: 38:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 39:
-; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10)
-; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0
-; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0
-; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]]
-; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32
-; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32
-; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0
-; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]]
-; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]]
-; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]]
-; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0
-; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0
-; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]]
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]]
-; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]]
-; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float>
%x2, i8 -1, i32 11)
- %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 10)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res2, %res3
- ret <4 x float> %res4
-}
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]]
-; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x
float> [[TMP24]]
-;
- %q = load float, ptr %ptr_b
- %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
- %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
- ret < 4 x float> %res
-}
-
-define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 {
-;
-; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
-; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0
-; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32
-; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]]
-; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]]
-; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]]
-; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8
-;
CHECK-NEXT: ret <4 x float> [[TMP24]]
-;
- %q = load float, ptr %ptr_b
- %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
- %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0,<4 x float> %vecinit.i, <4 x float> %x1, i8 %x3, i32 4)
- ret < 4 x float> %res
-}
-
-
-define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0
-; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0
-; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]]
-; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]])
-; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32
-; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0
-; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]]
-; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]]
-; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00
-; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0
-; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[TMP19]]
-;
- %q = load float, ptr %ptr_b
- %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
- %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %vecinit.i, i8 0, i32 4)
- ret < 4 x float> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 {
-; CHECK-LABEL:
@test_int_x86_avx512_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]] -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP11]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP10]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - ret <8 x i32> %res -} - -declare <16 x float> 
@llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) - -define <16 x float> @test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32) - -define <16 x float> @test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 { -; -; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32> -; 
CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 %x2, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %x0, <16 x float> %x1, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), 
align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP9]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 { -; CHECK-LABEL: @test_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) - ret <8 x double> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.compress.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) - -define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x 
i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 { -; CHECK-LABEL: @test_compress_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.compress.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) - -define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 { -; CHECK-LABEL: @test_compress_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.compress.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) - -define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_compress_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_compress_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 { -; CHECK-LABEL: @test_compress_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.compress.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) - -define <8 x double> 
@test_expand_pd_512(<8 x double> %data) #0 { -; CHECK-LABEL: @test_expand_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> undef, i8 -1) - ret <8 x double> %res -} - -define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %passthru, i8 %mask) - ret <8 x double> %res -} - -define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_expand_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> 
@llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP9]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> zeroinitializer, i8 %mask) - ret <8 x double> %res -} - -declare <8 x double> @llvm.x86.avx512.mask.expand.pd.512(<8 x double> %data, <8 x double> %src0, i8 %mask) - -define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 { -; CHECK-LABEL: @test_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> undef, i16 -1) - ret <16 x float> %res -} - -define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %passthru, i16 %mask) - ret <16 x float> %res -} - -define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> zeroinitializer, i16 %mask) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.mask.expand.ps.512(<16 x float> %data, <16 x float> %src0, i16 %mask) - -define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { -; CHECK-LABEL: @test_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> undef, i8 -1) - ret <8 x i64> %res -} - -define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) - ret <8 x i64> %res -} - -define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { -; -; 
CHECK-LABEL: @test_maskz_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %res = call <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> zeroinitializer, i8 %mask) - ret <8 x i64> %res -} - -declare <8 x i64> @llvm.x86.avx512.mask.expand.q.512(<8 x i64> %data, <8 x i64> %src0, i8 %mask) - -define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { -; CHECK-LABEL: @test_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> undef, i16 -1) - ret <16 x i32> %res -} - -define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { -; -; CHECK-LABEL: @test_mask_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> 
@llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) - ret <16 x i32> %res -} - -define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { -; -; CHECK-LABEL: @test_maskz_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %res = call <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> zeroinitializer, i16 %mask) - ret <16 x i32> %res -} - -declare <16 x i32> @llvm.x86.avx512.mask.expand.d.512(<16 x i32> %data, <16 x i32> %src0, i16 %mask) - -define <16 x float> @test_cmp_512(<16 x float> %a, <16 x float> %b, <16 x float> %c, <16 x float> %d, ptr %p) #0 { -; -; CHECK-LABEL: @test_cmp_512( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 1, <16 x i1> splat (i1 true), i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne 
i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[TMP14:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[C:%.*]], <16 x float> [[D:%.*]], i32 1, <16 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP5]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] -; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR8]] -; CHECK-NEXT: unreachable -; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = load <16 x float>, ptr [[P:%.*]], align 64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[P]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = xor i64 [[TMP18]], 87960930222080 -; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP20]], align 64 -; CHECK-NEXT: [[TMP21:%.*]] = xor <16 x i1> [[TMP9]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> zeroinitializer, <16 x i32> [[_MSLD]] -; CHECK-NEXT: [[TMP23:%.*]] = bitcast <16 x float> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i32> zeroinitializer, [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i32> [[TMP24]], zeroinitializer -; CHECK-NEXT: [[TMP26:%.*]] = or <16 x i32> [[TMP25]], [[_MSLD]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP26]], <16 x i32> [[TMP22]] -; CHECK-NEXT: [[TMP27:%.*]] = select <16 x i1> [[TMP21]], <16 x float> zeroinitializer, <16 x float> [[TMP17]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP27]] -; - entry: - %0 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 1, i32 8) - %1 = tail call <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float> %c, <16 x float> %d, i32 1, i32 4) - %2 = load <16 x float>, ptr %p - %3 = xor <16 x i1> %0, %1 - %4 = select <16 x i1> %3, <16 x float> zeroinitializer, <16 x float> %2 - ret <16 x float> %4 -} - -declare <16 x i1> @llvm.x86.avx512.cmp.ps.512(<16 x float>, <16 x float>, i32, i32) - -attributes #0 = { sanitize_memory } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll deleted file mode 100644 index 052b497831ee1..0000000000000 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll +++ /dev/null @@ -1,13714 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt %s -S -mtriple=x86_64-linux-gnu -mattr=+avx512f -passes=msan 2>&1 | FileCheck %s -; -; Forked from llvm/test/CodeGen/X86/avx512-intrinsics.ll - -define <8 x double> @test_mask_compress_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_compress_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1:![0-9]+]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10:[0-9]+]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <8 x double> @test_maskz_compress_pd_512(<8 x double> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP9]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <8 x double> @test_compress_pd_512(<8 x double> %data) #0 {
-; CHECK-LABEL: @test_compress_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true))
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP2]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> splat (i1 true))
- ret <8 x double> %1
-}
-
-define <16 x float> @test_mask_compress_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_mask_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP11]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1)
- ret <16 x float> %2
-}
-
-define <16 x float> @test_maskz_compress_ps_512(<16 x float> %data, i16 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP9]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1)
- ret <16 x float> %2
-}
-
-define <16 x float> @test_compress_ps_512(<16 x float> %data) #0 {
-; CHECK-LABEL: @test_compress_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true))
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP2]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> splat (i1 true))
- ret <16 x float> %1
-}
-
-define <8 x i64> @test_mask_compress_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1)
- ret <8 x i64> %2
-}
-
-define <8 x i64> @test_maskz_compress_q_512(<8 x i64> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP9]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1)
- ret <8 x i64> %2
-}
-
-define <8 x i64> @test_compress_q_512(<8 x i64> %data) #0 {
-; CHECK-LABEL: @test_compress_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true))
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP2]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> splat (i1 true))
- ret <8 x i64> %1
-}
-
-define <16 x i32> @test_mask_compress_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_mask_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1)
- ret <16 x i32> %2
-}
-
-define <16 x i32> @test_maskz_compress_d_512(<16 x i32> %data, i16 %mask) #0 {
-; CHECK-LABEL: @test_maskz_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP9]]
-;
- %1 = bitcast i16 %mask to <16 x i1>
- %2 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1)
- ret <16 x i32> %2
-}
-
-define <16 x i32> @test_compress_d_512(<16 x i32> %data) #0 {
-; CHECK-LABEL: @test_compress_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true))
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP2]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> splat (i1 true))
- ret <16 x i32> %1
-}
-
-define <8 x double> @test_expand_pd_512(<8 x double> %data) #0 {
-; CHECK-LABEL: @test_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> undef, <8 x i1> splat (i1 true))
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP2]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> undef, <8 x i1> splat (i1 true))
- ret <8 x double> %1
-}
-
-define <8 x double> @test_mask_expand_pd_512(<8 x double> %data, <8 x double> %passthru, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[TMP11:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP11]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> %passthru, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <8 x double> @test_maskz_expand_pd_512(<8 x double> %data, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_expand_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> [[DATA:%.*]], <8 x double> zeroinitializer, <8 x i1> [[TMP4]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP9]]
-;
- %1 = bitcast i8 %mask to <8 x i1>
- %2 = call <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double> %data, <8 x double> zeroinitializer, <8 x i1> %1)
- ret <8 x double> %2
-}
-
-define <16 x float> @test_expand_ps_512(<16 x float> %data) #0 {
-; CHECK-LABEL: @test_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> undef, <16 x i1> splat (i1 true))
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP2]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> undef, <16 x i1> splat (i1 true))
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mask_expand_ps_512(<16 x float> %data, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_mask_expand_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast
<16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> %passthru, <16 x i1> %1) - ret <16 x float> %2 -} - -define <16 x float> @test_maskz_expand_ps_512(<16 x float> %data, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_expand_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> [[DATA:%.*]], <16 x float> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float> %data, <16 x float> zeroinitializer, <16 x i1> %1) - ret <16 x float> %2 -} - -define <8 x i64> @test_expand_q_512(<8 x i64> %data) #0 { -; CHECK-LABEL: @test_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> undef, <8 x i1> splat (i1 true)) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP2]] -; - %1 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> undef, <8 x i1> ) - ret <8 x i64> %1 -} - -define <8 x i64> @test_mask_expand_q_512(<8 x i64> %data, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = 
bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> [[PASSTHRU:%.*]], <8 x i1> [[TMP5]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP11]] -; - %1 = bitcast i8 %mask to <8 x i1> - %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> %passthru, <8 x i1> %1) - ret <8 x i64> %2 -} - -define <8 x i64> @test_maskz_expand_q_512(<8 x i64> %data, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_expand_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> [[DATA:%.*]], <8 x i64> zeroinitializer, <8 x i1> [[TMP4]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP9]] -; - %1 = bitcast i8 %mask to <8 x i1> - %2 = call <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64> %data, <8 x i64> zeroinitializer, <8 x i1> %1) - ret <8 x i64> %2 -} - -define <16 x i32> @test_expand_d_512(<16 x i32> %data) #0 { -; CHECK-LABEL: @test_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> undef, <16 x i1> splat (i1 true)) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP2]] -; - %1 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> undef, <16 x i1> ) - ret <16 x i32> %1 -} - -define <16 x i32> @test_mask_expand_d_512(<16 x i32> %data, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_mask_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i1> [[TMP4]] to i16 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP8]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> [[PASSTHRU:%.*]], <16 x i1> [[TMP5]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP11]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> %passthru, <16 x i1> %1) - ret <16 x i32> %2 -} - -define <16 x i32> @test_maskz_expand_d_512(<16 x i32> %data, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_expand_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i1> [[TMP3]] to i16 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i16 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> [[DATA:%.*]], <16 x i32> zeroinitializer, <16 x i1> [[TMP4]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %1 = bitcast i16 %mask to <16 x i1> - %2 = call <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32> %data, <16 x i32> zeroinitializer, <16 x i1> %1) - ret <16 x i32> %2 -} - -define <16 x float> @test_rcp_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_rcp_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: 
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone - -define <8 x double> @test_rcp_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_rcp_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1] - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone - -declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32) - -define <2 x double> @test_rndscale_sd(<2 x double> %a, <2 x double> %b) #0 { -; CHECK-LABEL: @test_rndscale_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> undef, i8 -1, i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> undef, i8 -1, i32 11, i32 4) - ret <2 x double>%res -} - -define <2 x double> @test_rndscale_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_sd_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: 
[[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) - ret <2 x double>%res -} - -define <2 x double> @test_rndscale_sd_mask_load(<2 x double> %a, ptr %bptr, <2 x double> %c, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_sd_mask_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[B:%.*]] = load <2 x double>, ptr [[BPTR:%.*]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[BPTR]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <2 x i64>, ptr [[TMP9]], align 16 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: 
[[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B]], <2 x double> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %b = load <2 x double>, ptr %bptr - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask, i32 11, i32 4) - ret <2 x double>%res -} - -define <2 x double> @test_rndscale_sd_maskz(<2 x double> %a, <2 x double> %b, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_sd_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %a, <2 x double> %b, <2 x double> zeroinitializer, i8 %mask, i32 11, i32 4) - ret <2 x double>%res -} - -declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32) - -define <4 x float> @test_rndscale_ss(<4 x float> %a, <4 x float> %b) #0 { -; CHECK-LABEL: @test_rndscale_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> undef, i8 -1, i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) - ret <4 x float>%res -} - -define <4 x float> @test_rndscale_ss_load(<4 x float> %a, ptr %bptr) #0 { -; CHECK-LABEL: @test_rndscale_ss_load( -; 
CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load <4 x float>, ptr [[BPTR:%.*]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[BPTR]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B]], <4 x float> undef, i8 -1, i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %b = load <4 x float>, ptr %bptr - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> undef, i8 -1, i32 11, i32 4) - ret <4 x float>%res -} - -define <4 x float> @test_rndscale_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_ss_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> [[C:%.*]], i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> 
%a, <4 x float> %b, <4 x float> %c, i8 %mask, i32 11, i32 4) - ret <4 x float>%res -} - -define <4 x float> @test_rndscale_ss_maskz(<4 x float> %a, <4 x float> %b, i8 %mask) #0 { -; CHECK-LABEL: @test_rndscale_ss_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 11, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask, i32 11, i32 4) - ret <4 x float>%res -} - -declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32) - -define <8 x double> @test7(<8 x double> %a) #0 { -; CHECK-LABEL: @test7( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> [[A:%.*]], i32 11, <8 x double> [[A]], i8 -1, i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4) - ret <8 x double>%res -} - -declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32) - -define <16 x float> @test8(<16 x float> %a) #0 { -; CHECK-LABEL: @test8( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], 
[[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> [[A:%.*]], i32 11, <16 x float> [[A]], i16 -1, i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4) - ret <16 x float>%res -} - -define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_rsqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1] - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone - -define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP2]] -; - %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) - ret <8 x double> %1 -} - -define <8 x double> @test_mask_sqrt_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP1]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x double> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP12]], <8 x i64> [[TMP7]] -; 
CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP6]], <8 x double> [[TMP4]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP13]] -; - %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @test_maskz_sqrt_pd_512(<8 x double> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.sqrt.v8f64(<8 x double> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x double> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i64> [[TMP10]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP5]], <8 x double> [[TMP3]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP11]] -; - %1 = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a0) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - ret <8 x double> %3 -} -declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) - -define <8 x double> @test_sqrt_round_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP5]] -; - %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) - ret <8 x double> %1 -} - -define <8 x double> @test_mask_sqrt_round_pd_512(<8 x double> %a0, <8 x double> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; 
CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> zeroinitializer, <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[PASSTHRU:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP9]], <8 x double> [[TMP7]], <8 x double> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP16]] -; - %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %passthru - ret <8 x double> %3 -} - -define <8 x double> @test_maskz_sqrt_round_pd_512(<8 x double> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP8]], <8 x double> [[TMP6]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP14]] -; - %1 = call <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double> %a0, i32 11) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - ret <8 x double> %3 -} -declare <8 x double> @llvm.x86.avx512.sqrt.pd.512(<8 x double>, i32) nounwind readnone - -define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_sqrt_ps_512( -; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP2]] -; - %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) - ret <16 x float> %1 -} - -define <16 x float> @test_mask_sqrt_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP12]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP6]], <16 x float> [[TMP4]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP13]] -; - %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @test_maskz_sqrt_ps_512(<16 x float> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.sqrt.v16f32(<16 x float> [[A0:%.*]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP7]], zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP10]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[TMP3]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP11]] -; - %1 = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a0) - 
%2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} -declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) - -define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP5]] -; - %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_mask_sqrt_round_ps_512(<16 x float> %a0, <16 x float> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_mask_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP9]], <16 x float> [[TMP7]], <16 x float> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru - ret <16 x float> %3 -} - -define <16 x float> @test_maskz_sqrt_round_ps_512(<16 x float> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_maskz_sqrt_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> [[A0:%.*]], i32 11) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP8]], <16 x float> [[TMP6]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP14]] -; - %1 = call <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float> %a0, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} -declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>, i32) nounwind readnone - -define <8 x double> @test_getexp_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_getexp_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4) - ret <8 x double> %res -} -define <8 x double> @test_getexp_round_pd_512(<8 x double> %a0) #0 { -; CHECK-LABEL: @test_getexp_round_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double> [[A0:%.*]], <8 x double> zeroinitializer, i8 -1, i32 12) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> 
@llvm.x86.avx512.mask.getexp.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 12) - ret <8 x double> %res -} -declare <8 x double> @llvm.x86.avx512.mask.getexp.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone - -define <16 x float> @test_getexp_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_getexp_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4) - ret <16 x float> %res -} - -define <16 x float> @test_getexp_round_ps_512(<16 x float> %a0) #0 { -; CHECK-LABEL: @test_getexp_round_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> [[A0:%.*]], <16 x float> zeroinitializer, i16 -1, i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) - ret <16 x float> %res -} -declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone - -declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone - -define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_sqrt_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] -; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 9) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 [[MASK]], i32 10) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] -; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] -; CHECK: 21: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 22: -; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> zeroinitializer, i8 -1, i32 11) -; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES_2:%.*]] = fadd <4 x float> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES:%.*]] = fadd <4 x float> [[RES_1]], [[RES_2]] -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) - %res1 = call <4 x float> 
@llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9) - %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 10) - %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 11) - - %res.1 = fadd <4 x float> %res0, %res1 - %res.2 = fadd <4 x float> %res2, %res3 - %res = fadd <4 x float> %res.1, %res.2 - ret <4 x float> %res -} - -declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone - -define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_sqrt_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]] -; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 9) -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; 
CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSCMP13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 [[MASK]], i32 10) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] -; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP21:%.*]], label [[TMP22:%.*]], !prof [[PROF1]] -; CHECK: 21: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 22: -; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> zeroinitializer, i8 -1, i32 11) -; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES_2:%.*]] = fadd <2 x double> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES:%.*]] = fadd <2 x double> [[RES_1]], [[RES_2]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9) - %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 10) - %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 11) - - %res.1 = fadd <2 x double> %res0, %res1 - %res.2 = fadd <2 x double> %res2, %res3 - %res = fadd <2 x double> %res.1, %res.2 - ret <2 x double> %res -} - -define i32 @test_x86_avx512_cvttsd2usi(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttsd2usi( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> [[A0]], i32 8) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 
@llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 4) ; - %res1 = call i32 @llvm.x86.avx512.cvttsd2usi(<2 x double> %a0, i32 8) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvttsd2si(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttsd2si( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> [[A0]], i32 8) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 4) ; - %res1 = call i32 @llvm.x86.avx512.cvttsd2si(<2 x double> %a0, i32 8) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttsd2si(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvttss2si(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttss2si( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0:%.*]], i32 8) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A0]], i32 4) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 8) ; - %res1 = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a0, i32 4) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttss2si(<4 x float>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvttss2si_load(ptr %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttss2si_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br 
i1 [[_MSCMP]], label [[TMP2:%.*]], label [[TMP3:%.*]], !prof [[PROF1]] -; CHECK: 2: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 3: -; CHECK-NEXT: [[A1:%.*]] = load <4 x float>, ptr [[A0:%.*]], align 16 -; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[A0]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = xor i64 [[TMP4]], 87960930222080 -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP6]], align 16 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> [[A1]], i32 4) -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES]] -; - %a1 = load <4 x float>, ptr %a0 - %res = call i32 @llvm.x86.avx512.cvttss2si(<4 x float> %a1, i32 4) ; - ret i32 %res -} - -define i32 @test_x86_avx512_cvttss2usi(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvttss2usi( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES0:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0:%.*]], i32 8) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> [[A0]], i32 4) -; CHECK-NEXT: [[RES2:%.*]] = add i32 [[RES0]], [[RES1]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES2]] -; - %res0 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 8) ; - %res1 = call i32 @llvm.x86.avx512.cvttss2usi(<4 x float> %a0, i32 4) ; - %res2 = add i32 %res0, %res1 - ret i32 %res2 -} -declare i32 @llvm.x86.avx512.cvttss2usi(<4 x float>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtsd2usi32( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 
6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtsd2si32( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtss2usi32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i32 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone - -define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtss2si32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0:%.*]], i32 4) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 11) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> [[A0]], i32 9) -; CHECK-NEXT: [[RES3:%.*]] = add i32 [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add i32 [[RES3]], [[RES2]] -; CHECK-NEXT: store i32 0, ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[RES4]] -; - %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4) - %res1 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 11) - %res2 = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 9) - %res3 = add i32 %res, %res1 - %res4 = add i32 %res3, %res2 - ret i32 %res4 -} -declare i32 @llvm.x86.avx512.vcvtss2si32(<4 x float>, i32) nounwind readnone - -define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16 %mask, ptr %dst) #0 { -; CHECK-LABEL: @test_x86_vcvtps2ph_256( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0:%.*]], i32 2, <16 x i16> zeroinitializer, i16 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 11, <16 x i16> zeroinitializer, i16 [[MASK:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i16> [[TMP3]] to i256 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]] -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i16 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES3:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 12, <16 x i16> [[SRC:%.*]], i16 [[MASK]]) -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP8]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]] -; CHECK: 15: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 16: -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[DST:%.*]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 87960930222080 -; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr -; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr [[TMP19]], align 32 -; CHECK-NEXT: store <16 x i16> 
[[RES1]], ptr [[DST]], align 32 -; CHECK-NEXT: [[RES:%.*]] = add <16 x i16> [[RES2]], [[RES3]] -; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i16> [[RES]] -; - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 11, <16 x i16> zeroinitializer, i16 %mask) - %res3 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 12, <16 x i16> %src, i16 %mask) - store <16 x i16> %res1, ptr %dst - %res = add <16 x i16> %res2, %res3 - ret <16 x i16> %res -} - -declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly - -define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) #0 { -; CHECK-LABEL: @test_cmpps( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 2, <16 x i1> splat (i1 true), i32 8) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i1> [[RES]] to i16 -; CHECK-NEXT: store i16 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i16 [[TMP7]] -; - %res = call <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8) - %1 = bitcast <16 x i1> %res to i16 - ret i16 %1 -} -declare <16 x i1> @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, <16 x i1>, i32) - -define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) #0 { -; CHECK-LABEL: @test_cmppd( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 4, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i1> [[RES]] to i8 -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[TMP7]] -; - %res = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4) - 
%1 = bitcast <8 x i1> %res to i8 - ret i8 %1 -} -declare <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, <8 x i1>, i32) - - - ; fp min - max -define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) #0 { -; CHECK-LABEL: @test_vmaxpd( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP7]] -; - %1 = call <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) - ret <8 x double> %1 -} -declare <8 x double> @llvm.x86.avx512.max.pd.512(<8 x double>, <8 x double>, i32) - -define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) #0 { -; CHECK-LABEL: @test_vminpd( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP7]] -; - %1 = call <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double> %a0, <8 x double> %a1, i32 4) - ret <8 x double> %1 -} -declare <8 x double> @llvm.x86.avx512.min.pd.512(<8 x double>, <8 x double>, i32) - -define void @test_mask_store_ss(ptr %ptr, <4 x float> %data, i8 %mask) #0 { -; CHECK-LABEL: @test_mask_store_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = and i8 [[TMP1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = and i8 [[MASK:%.*]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = and i8 [[TMP1]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = or i8 [[TMP4]], [[TMP5]] -; 
CHECK-NEXT: [[TMP8:%.*]] = or i8 [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = and i8 [[MASK]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP9]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[EXTRACT:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> [[TMP11]], <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[PTR:%.*]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 -; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[TMP14]], i32 1, <4 x i1> [[EXTRACT]]) -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP3]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP16:%.*]], label [[TMP17:%.*]], !prof [[PROF1]] -; CHECK: 16: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 17: -; CHECK-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[DATA:%.*]], ptr [[PTR]], i32 1, <4 x i1> [[EXTRACT]]) -; CHECK-NEXT: ret void -; - %1 = and i8 %mask, 1 - %2 = bitcast i8 %1 to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> - call void @llvm.masked.store.v4f32.p0(<4 x float> %data, ptr %ptr, i32 1, <4 x i1> %extract) - ret void -} -declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32, <4 x i1>) #1 - - -declare <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float>, <16 x float>, i32) -declare <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float>, <16 x float>, i32) -declare <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double>, <8 x double>, i32) - -define <16 x float> @test_vsubps_rn(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_rn( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_vsubps_rd(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_rd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - ret <16 x float> %1 -} - -define <16 x float> @test_vsubps_ru(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_ru( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - ret <16 x float> %1 -} - -define <16 x float> @test_vsubps_rz(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vsubps_rz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_rn(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_rn( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x 
i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_rd(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_rd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_ru(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_ru( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> 
@llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_rz(<16 x float> %a0, <16 x float> %a1) #0 { -; CHECK-LABEL: @test_vmulps_rz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_vmulps_mask_rn(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_rn( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to 
<16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_rd(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_rd( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_ru(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_ru( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x 
float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_rz(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_vmulps_mask_rz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_vmulps_mask_passthru_rn(<16 x float> %a0, <16 x float> %a1, <16 x float> 
%passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rd(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <16 x float> @test_vmulps_mask_passthru_ru(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <16 x float> @test_vmulps_mask_passthru_rz(<16 x float> %a0, <16 x float> %a1, <16 x float> %passthru, i16 %mask) #0 {
-; CHECK-LABEL: @test_vmulps_mask_passthru_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[PASSTHRU:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[PASSTHRU]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.mul.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %passthru
- ret <16 x float> %3
-}
-
-define <8 x double> @test_vmulpd_mask_rn(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 8)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <8 x double> @test_vmulpd_mask_rd(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 9)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <8 x double> @test_vmulpd_mask_ru(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 10)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <8 x double> @test_vmulpd_mask_rz(<8 x double> %a0, <8 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_vmulpd_mask_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> [[A0:%.*]], <8 x double> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.mul.pd.512(<8 x double> %a0, <8 x double> %a1, i32 11)
- %2 = bitcast i8 %mask to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_maskz_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_add_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_add_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_add_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_add_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_add_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- ret <16 x float> %1
-}
-declare <16 x float> @llvm.x86.avx512.add.ps.512(<16 x float>, <16 x float>, i32)
-
-define <16 x float> @test_mm512_mask_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_sub_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_sub_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_rd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_ru_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_rz_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_sub_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_sub_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.sub.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_maskz_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rn_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x
i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x 
i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> 
[[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_div_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void 
@llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x 
float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load 
<16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_div_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; 
CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_div_round_ps_rn_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_rn_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_rd_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_rd_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: 
[[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 9) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 9) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_ru_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_ru_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 10) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 10) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_rz_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_rz_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 11) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 11) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_div_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_div_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - ret <16 x float> %1 -} -declare <16 x float> @llvm.x86.avx512.div.ps.512(<16 x float>, <16 x float>, i32) - -define <16 x float> @test_mm512_maskz_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; 
CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_mask_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_mask_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_min_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { 
-; CHECK-LABEL: @test_mm512_min_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - ret <16 x float> %1 -} - -define <16 x float> @test_mm512_min_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_min_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP7]] -; - %1 = call <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4) - ret <16 x float> %1 -} -declare <16 x float> @llvm.x86.avx512.min.ps.512(<16 x float>, <16 x float>, i32) - -define <16 x float> @test_mm512_maskz_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_sae( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; 
CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <16 x float> @test_mm512_maskz_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_mm512_maskz_max_round_ps_current( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP16]] -; - %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> 
%a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_max_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_mask_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, <16 x float> %src, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_mask_max_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[SRC:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[SRC]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- %2 = bitcast i16 %mask to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %src
- ret <16 x float> %3
-}
-
-define <16 x float> @test_mm512_max_round_ps_sae(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_max_round_ps_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 8)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 8)
- ret <16 x float> %1
-}
-
-define <16 x float> @test_mm512_max_round_ps_current(<16 x float> %a0, <16 x float> %a1, i16 %mask) #0 {
-; CHECK-LABEL: @test_mm512_max_round_ps_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> [[A0:%.*]], <16 x float> [[A1:%.*]], i32 4)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float> %a0, <16 x float> %a1, i32 4)
- ret <16 x float> %1
-}
-declare <16 x float> @llvm.x86.avx512.max.ps.512(<16 x float>, <16 x float>, i32)
-
-declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_mask_add_ss_rn(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_rd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 9)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_ru(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 10)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 11)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_current(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_add_ss_rn(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_ss_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_add_ss_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_ss_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_ss_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_rd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 9)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 9)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_ru(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 10)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 10)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_rz(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 11)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 11)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_current(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_add_sd_rn(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_sd_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_add_sd_rn(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_add_sd_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %a1.val = load double, ptr %a1
- %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
- %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_add_sd_current_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %a1.val = load double, ptr %a1
- %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
- %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
- %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_ss_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_max_ss_sae(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_ss_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_max_ss_sae(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_max_ss_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_max_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_max_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_max_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, ptr %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_ss_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, ptr %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_ss_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load float, ptr [[A1:%.*]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <4 x float> undef, float [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1
-; CHECK-NEXT: [[A1V1:%.*]] = insertelement <4 x float> [[A1V0]], float 0.000000e+00, i32 1
-; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2
-; CHECK-NEXT: [[A1V2:%.*]] = insertelement <4 x float> [[A1V1]], float 0.000000e+00, i32 2
-; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <4 x float> [[A1V2]], float 0.000000e+00, i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[_MSPROP3]] to i128
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]]
-; CHECK: 11:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 12:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float> [[A0:%.*]], <4 x float> [[A1V]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %a1.val = load float, ptr %a1
- %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
- %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
- %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
- %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
- %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
- ret <4 x float> %res
-}
-declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_sd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_max_sd_sae(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_sd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_max_sd_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_max_sd_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 8)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_maskz_max_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_max_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_max_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, ptr %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_max_sd_memfold(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[A1]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080
-; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8
-; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0
-; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0
-; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1
-; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1
-; CHECK-NEXT:
[[TMP10:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]] -; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %a1.val = load double, ptr %a1 - %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0 - %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1 - %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4) - ret <2 x double> %res -} - -define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, ptr %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_maskz_max_sd_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A1_VAL:%.*]] = load double, ptr [[A1:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A1]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[A1V0:%.*]] = insertelement <2 x double> undef, double [[A1_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 -; CHECK-NEXT: [[A1V:%.*]] = insertelement <2 x double> [[A1V0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[_MSPROP1]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP11:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 11: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 12: -; CHECK-NEXT: 
[[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double> [[A0:%.*]], <2 x double> [[A1V]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 4) -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES]] -; - %a1.val = load double, ptr %a1 - %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0 - %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1 - %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4) - ret <2 x double> %res -} - -define <4 x float> @test_x86_avx512_cvtsi2ss32(<4 x float> %a, i32 %b) #0 { -; CHECK-LABEL: @test_x86_avx512_cvtsi2ss32( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 11) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float> %a, i32 %b, i32 11) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.cvtsi2ss32(<4 x float>, i32, i32) nounwind readnone - -define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss (<4 x float> %a, i32 %b) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 9) -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} - -define <4 x float> @test_x86_avx512__mm_cvt_roundu32_ss_mem(<4 x float> %a, ptr %ptr) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvt_roundu32_ss_mem( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label 
[[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 9) -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %b = load i32, ptr %ptr - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 9) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} - -define <4 x float> @test_x86_avx512__mm_cvtu32_ss(<4 x float> %a, i32 %b) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B:%.*]], i32 4) -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} - -define <4 x float> @test_x86_avx512__mm_cvtu32_ss_mem(<4 x float> %a, ptr %ptr) #0 { -; CHECK-LABEL: @test_x86_avx512__mm_cvtu32_ss_mem( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[B:%.*]] = load i32, ptr [[PTR:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[PTR]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i32 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i32 [[_MSLD]], 0 -; CHECK-NEXT: br i1 [[_MSCMP1]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; 
CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> [[A:%.*]], i32 [[B]], i32 4) -; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %b = load i32, ptr %ptr - %res = call <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float> %a, i32 %b, i32 4) ; <<<4 x float>> [#uses=1] - ret <4 x float> %res -} -declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind readnone - -declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) - -define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2P]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP9]] -; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: 
[[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X1]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - ret <16 x i32> %3 -} - -declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) - -define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP9]] -; - %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) - ret <8 x double> %1 -} - -define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x double> [[X2:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[X1]] to <8 x double> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i64> zeroinitializer, <8 x i64> [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x double> [[TMP10]] to <8 x i64> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x double> [[TMP11]] to <8 x i64> -; CHECK-NEXT: [[TMP17:%.*]] = xor <8 x i64> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i64> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i64> [[TMP18]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP19]], <8 x i64> [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select <8 x i1> [[TMP13]], <8 x double> [[TMP10]], <8 x double> [[TMP11]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP20]] -; - %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) - %2 = bitcast <8 x i64> %x1 to <8 x double> - %3 = bitcast i8 %x3 to <8 x i1> - %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2 - ret <8 x double> %4 -} - -declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) - -define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 -; 
CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP9]] -; - %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) - ret <16 x float> %1 -} - -define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x float> [[X2:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[X1]] to <16 x float> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> zeroinitializer, <16 x i32> [[TMP2]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x float> [[TMP11]] to <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i32> [[TMP18]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP19]], <16 x i32> [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP13]], <16 x float> [[TMP10]], <16 x float> [[TMP11]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP20]] -; - %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) - %2 = bitcast <16 x i32> %x1 to <16 x float> - %3 = bitcast i16 %x3 to <16 x i1> - %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 - ret <16 x float> 
%4 -} - -declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) - -define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermi2var_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) -; CHECK-NEXT: store <8 x i64> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP4]] -; - %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) - ret <8 x i64> %1 -} - -define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermi2var_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 - ret <8 x i64> %3 -} - -define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 
64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2P:%.*]], align 64 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2P]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP9]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP17]] -; - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - ret <16 x i32> %3 -} - -define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, ptr %x2ptr, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 136) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[X2S:%.*]] = load double, ptr [[X2PTR:%.*]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[X2PTR]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <8 x 
i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[X2INS:%.*]] = insertelement <8 x double> undef, double [[X2S]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = shufflevector <8 x i64> [[_MSPROP]], <8 x i64> splat (i64 -1), <8 x i32> zeroinitializer -; CHECK-NEXT: [[X2:%.*]] = shufflevector <8 x double> [[X2INS]], <8 x double> undef, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[_MSPROP1]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[TMP15:%.*]] = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x double> [[X2]]) -; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP17]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x double> [[TMP15]] to <8 x i64> -; CHECK-NEXT: [[TMP20:%.*]] = xor <8 x i64> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = or <8 x i64> [[TMP20]], zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i64> [[TMP21]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP16]], <8 x i64> [[TMP22]], <8 x i64> [[TMP18]] -; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP17]], <8 x double> [[TMP15]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[TMP23]] -; - %x2s = load double, ptr %x2ptr - %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 - %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer - %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - ret <8 x double> %3 -} - -define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 
[[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x float> [[X2:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[TMP10]] to <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP17]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP12]], <16 x float> [[TMP10]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP18]] -; - %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - ret <16 x float> %3 -} - -define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vpermt2var_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i64> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> [[X1:%.*]], <8 x i64> [[X0:%.*]], <8 x i64> [[X2:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP1]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[TMP12]] -; - %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x 
i64> %x0, <8 x i64> %x2) - %2 = bitcast i8 %x3 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - ret <8 x i64> %3 -} - -define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP4]] -; - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - ret <16 x i32> %1 -} - -define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vpermt2var_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> [[X1:%.*]], <16 x i32> [[X0:%.*]], <16 x i32> [[X2:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X1]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP1]] -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[TMP12]] -; - %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - %2 = bitcast i16 %x3 to <16 x i1> - %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - ret <16 x i32> %3 -} - -declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) -define <8 x double>@test_int_x86_avx512_mask_scalef_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_pd_512( -; CHECK-NEXT: 
[[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 11) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x double> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3, i32 11) - %res1 = call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1, i32 8) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 -} - -declare <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) -define <16 x float>@test_int_x86_avx512_mask_scalef_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_scalef_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to 
ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 10) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x float> [[X2]], i16 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 10) - %res1 = call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64>, <16 x i8>, i8) - -define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 
[[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmov_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qb_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmov.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64>, <16 x i8>, i8) - -define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x 
i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovs_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qb_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovs.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64>, <16 x i8>, i8) - -define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0:%.*]], <16 x i8> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> [[X0]], <16 x i8> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> %x1, i8 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.qb.512(<8 x i64> %x0, <16 x i8> zeroinitializer, i8 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovus_qb_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qb_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: 
[[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovus.qb.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], 
[[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[RES4]] -; - %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res3 = add <8 x i16> %res0, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 -} - -declare void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmov_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qw_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmov.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = 
or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <8 x i16> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[RES4]] -; - %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res3 = add <8 x i16> %res0, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovs_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qw_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; 
CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovs.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8) - -define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0:%.*]], <8 x i16> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> [[X0]], <8 x i16> zeroinitializer, i8 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <8 
x i16> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <8 x i16> [[RES3]], [[RES2]] -; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i16> [[RES4]] -; - %res0 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> %x1, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res3 = add <8 x i16> %res0, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 -} - -declare void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qw_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovus.qw.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: store <8 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP2]] -; - %1 = trunc <8 x i64> %x0 to <8 x i32> - ret <8 x i32> %1 -} - -define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add 
(i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[_MSPROP]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP4]], [[X1:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], [[_MSPROP]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP10]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[TMP4]], <8 x i32> [[X1]] -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP11]] -; - %1 = trunc <8 x i64> %x0 to <8 x i32> - %2 = bitcast i8 %x2 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 - ret <8 x i32> %3 -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmov_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[X0:%.*]] to <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X2:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[_MSPROP]], <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i32> [[TMP7]], [[_MSPROP]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP9]], <8 x i32> [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP5]], <8 x i32> [[TMP3]], <8 x i32> zeroinitializer -; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[TMP10]] -; - %1 = trunc <8 x i64> %x0 to <8 x i32> - %2 = bitcast i8 %x2 to <8 x i1> - %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer - ret <8 x i32> %3 -} - -declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmov_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_qd_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; 
CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmov.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovs_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] 
= or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovs_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - ret <8 x i32> %res -} - -declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_qd_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] 
-; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovs.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_pmovus_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 -1) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - ret <8 x i32> %res -} - -define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_pmovus_qd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr 
inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i8 [[TMP2]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> [[X0:%.*]], <8 x i32> zeroinitializer, i8 [[X2:%.*]]) -; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i32> [[RES]] -; - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - ret <8 x i32> %res -} - -declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64>, i8) - -define void @test_int_x86_avx512_mask_pmovus_qd_mem_512(ptr %ptr, <8 x i64> %x1, i8 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_qd_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR:%.*]], <8 x i64> [[X1:%.*]], i8 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr [[PTR]], <8 x i64> [[X1]], i8 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 -1) - call void @llvm.x86.avx512.mask.pmovus.qd.mem.512(ptr %ptr, <8 x i64> %x1, i8 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16) - -define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr 
(i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]]) -; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i8> [[RES4]] -; - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 -} - -declare void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32>, i16) - -define void @test_int_x86_avx512_mask_pmov_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_db_mem_512( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: 
[[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1) -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]]) -; CHECK-NEXT: ret void -; - call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1) - call void @llvm.x86.avx512.mask.pmov.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2) - ret void -} - -declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16) - -define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0 -; 
CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i8> [[RES4]]
-;
- %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
- %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
- %res3 = add <16 x i8> %res0, %res1
- %res4 = add <16 x i8> %res3, %res2
- ret <16 x i8> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovs_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_db_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovs.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
-
-define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0:%.*]], <16 x i8> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i8> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> [[X0]], <16 x i8> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i8> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i8> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i8> [[RES4]]
-;
- %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 -1)
- %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> %x1, i16 %x2)
- %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %x0, <16 x i8> zeroinitializer, i16 %x2)
- %res3 = add <16 x i8> %res0, %res1
- %res4 = add <16 x i8> %res3, %res2
- ret <16 x i8> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovus_db_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_db_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovus.db.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i16> [[RES4]]
-;
- %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
- %res3 = add <16 x i16> %res0, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmov_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmov_dw_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmov.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i16> [[RES4]]
-;
- %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
- %res3 = add <16 x i16> %res0, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovs_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovs_dw_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovs.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
-
-define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES0:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0:%.*]], <16 x i16> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i16> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP12]], 0
-; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSCMP7]], [[_MSCMP8]]
-; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> [[X0]], <16 x i16> zeroinitializer, i16 [[X2]])
-; CHECK-NEXT: [[RES3:%.*]] = add <16 x i16> [[RES0]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = add <16 x i16> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i16> [[RES4]]
-;
- %res0 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 -1)
- %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> %x1, i16 %x2)
- %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %x0, <16 x i16> zeroinitializer, i16 %x2)
- %res3 = add <16 x i16> %res0, %res1
- %res4 = add <16 x i16> %res3, %res2
- ret <16 x i16> %res4
-}
-
-declare void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32>, i16)
-
-define void @test_int_x86_avx512_mask_pmovus_dw_mem_512(ptr %ptr, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pmovus_dw_mem_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 72) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR:%.*]], <16 x i32> [[X1:%.*]], i16 -1)
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]]
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSOR4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr [[PTR]], <16 x i32> [[X1]], i16 [[X2:%.*]])
-; CHECK-NEXT: ret void
-;
- call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 -1)
- call void @llvm.x86.avx512.mask.pmovus.dw.mem.512(ptr %ptr, <16 x i32> %x1, i16 %x2)
- ret void
-}
-
-declare <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32>, i32)
-
-define <16 x float>@test_int_x86_avx512_mask_cvt_dq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_dq2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[CVT:%.*]] = sitofp <16 x i32> [[X0:%.*]] to <16 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer
-; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES2]]
-;
- %cvt = sitofp <16 x i32> %x0 to <16 x float>
- %1 = bitcast i16 %x2 to <16 x i1>
- %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
- %3 = call <16 x float> @llvm.x86.avx512.sitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
- %res2 = fadd <16 x float> %2, %3
- ret <16 x float> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
-
-define <8 x float>@test_int_x86_avx512_mask_cvt_pd2ps_512(<8 x double> %x0, <8 x float> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0:%.*]], <8 x float> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> [[X0]], <8 x float> [[X1]], i8 -1, i32 10)
-; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x float> [[RES2]]
-;
- %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 %x2, i32 4)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %x0, <8 x float> %x1, i8 -1, i32 10)
- %res2 = fadd <8 x float> %res, %res1
- ret <8 x float> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_pd2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 10)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 10)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float>, <8 x double>, i8, i32)
-
-define <8 x double>@test_int_x86_avx512_mask_cvt_ps2pd_512(<8 x float> %x0, <8 x double> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0:%.*]], <8 x double> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> [[X0]], <8 x double> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES2]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 %x2, i32 4)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtps2pd.512(<8 x float> %x0, <8 x double> %x1, i8 -1, i32 8)
- %res2 = fadd <8 x double> %res, %res1
- ret <8 x double> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ps2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 10)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 10)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2dq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32>, i32)
-
-define <16 x float>@test_int_x86_avx512_mask_cvt_udq2ps_512(<16 x i32> %x0, <16 x float> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_udq2ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[CVT:%.*]] = uitofp <16 x i32> [[X0:%.*]] to <16 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i16 [[TMP2]] to <16 x i1>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[X2:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP1]], <16 x i32> [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x float> [[CVT]] to <16 x i32>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[X1:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP1]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP3]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP4]], <16 x i32> [[TMP11]], <16 x i32> [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP5]], <16 x float> [[CVT]], <16 x float> [[X1]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP13]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
-; CHECK: 14:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 15:
-; CHECK-NEXT: [[TMP16:%.*]] = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> [[X0]], i32 8)
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[_MSPROP_SELECT]], zeroinitializer
-; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[TMP12]], [[TMP16]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[RES2]]
-;
- %cvt = uitofp <16 x i32> %x0 to <16 x float>
- %1 = bitcast i16 %x2 to <16 x i1>
- %2 = select <16 x i1> %1, <16 x float> %cvt, <16 x float> %x1
- %3 = call <16 x float> @llvm.x86.avx512.uitofp.round.v16f32.v16i32(<16 x i32> %x0, i32 8)
- %res2 = fadd <16 x float> %2, %3
- ret <16 x float> %res2
-}
-
-declare <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
-
-define <8 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_512(<8 x double> %x0, <8 x i32> %x1, i8 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_pd2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 96) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0:%.*]], <8 x i32> [[X1:%.*]], i8 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i32> [[TMP2]] to i256
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i256 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> [[X0]], <8 x i32> [[X1]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <8 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i32> [[RES2]]
-;
- %res = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 %x2, i32 4)
- %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttpd2udq.512(<8 x double> %x0, <8 x i32> %x1, i8 -1, i32 8)
- %res2 = add <8 x i32> %res, %res1
- ret <8 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2dq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2dq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float>, <16 x i32>, i16, i32)
-
-define <16 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_512(<16 x float> %x0, <16 x i32> %x1, i16 %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvtt_ps2udq_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]], i16 [[X2:%.*]], i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]]
-; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]]
-; CHECK: 10:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 11:
-; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> [[X0]], <16 x i32> [[X1]], i16 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES]], [[RES1]]
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[RES2]]
-;
- %res = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 %x2, i32 4)
- %res1 = call <16 x i32> @llvm.x86.avx512.mask.cvttps2udq.512(<16 x float> %x0, <16 x i32> %x1, i16 -1, i32 8)
- %res2 = add <16 x i32> %res, %res1
- ret <16 x i32> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_getexp_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 -1, i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 8)
- ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_getexp_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES0:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0]], <4 x float> [[A1]], <4 x float> [[A2]], i8 [[MASK]], i32 8)
-; CHECK-NEXT: [[RES_1:%.*]] = fadd <4 x float> [[RES0]], [[RES1]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES_1]]
-;
- %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
- %res.1 = fadd <4 x float> %res0, %res1
- ret <4 x float> %res.1
-}
-
-define <4 x float> @test_maskz_getexp_ss(<4 x float> %a0, <4 x float> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_getexp_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], <4 x float> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 8)
- ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_getexp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 -1, i32 4)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 4)
- ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) #0 {
-; CHECK-LABEL: @test_mask_getexp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES0:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> [[A2:%.*]], i8 [[MASK:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSOR10]], [[_MSCMP11]]
-; CHECK-NEXT: br i1 [[_MSOR12]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0]], <2 x double> [[A1]], <2 x double> [[A2]], i8 [[MASK]], i32 8)
-; CHECK-NEXT: [[RES_1:%.*]] = fadd <2 x double> [[RES0]], [[RES1]]
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES_1]]
-;
- %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
- %res.1 = fadd <2 x double> %res0, %res1
- ret <2 x double> %res.1
-}
-
-define <2 x double> @test_maskz_getexp_sd(<2 x double> %a0, <2 x double> %a1, i8 %mask) #0 {
-; CHECK-LABEL: @test_maskz_getexp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], <2 x double> zeroinitializer, i8 [[MASK:%.*]], i32 8)
-; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <2 x double> [[RES]]
-;
- %res = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 8)
- ret <2 x double> %res
-}
-
-declare i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double>, <2 x double>, i32, i8, i32)
-
-define i8@test_int_x86_avx512_mask_cmp_sd(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 5, i8 [[X3:%.*]], i32 8)
-; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i8 [[RES4]]
-;
- %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8)
- ret i8 %res4
-}
-
-define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1, i8 %x3, i32 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_sd_all(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 2, i8 -1, i32 4)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0
-;
CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 3, i8 -1, i32 8) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]] -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 4, i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]] -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]] -; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] -; CHECK: 18: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 19: -; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 5, i8 [[X3]], i32 8) -; CHECK-NEXT: [[TMP20:%.*]] = xor i8 [[RES1]], -1 -; CHECK-NEXT: [[TMP21:%.*]] = xor i8 [[RES2]], -1 -; CHECK-NEXT: [[TMP22:%.*]] = and i8 [[TMP20]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = and i8 0, [[TMP21]] -; CHECK-NEXT: [[TMP24:%.*]] = or i8 0, [[TMP22]] -; CHECK-NEXT: [[TMP25:%.*]] = or i8 [[TMP24]], [[TMP23]] -; CHECK-NEXT: [[RES11:%.*]] = or i8 [[RES1]], [[RES2]] -; CHECK-NEXT: [[TMP26:%.*]] = xor i8 [[RES3]], -1 -; CHECK-NEXT: [[TMP27:%.*]] = xor i8 [[RES4]], -1 -; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP26]], 0 -; CHECK-NEXT: [[TMP29:%.*]] = and i8 0, [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = or i8 0, [[TMP28]] -; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP30]], [[TMP29]] -; CHECK-NEXT: [[RES12:%.*]] = or i8 [[RES3]], [[RES4]] -; CHECK-NEXT: [[TMP32:%.*]] = xor i8 [[RES11]], -1 -; CHECK-NEXT: [[TMP33:%.*]] = xor i8 [[RES12]], -1 -; CHECK-NEXT: [[TMP34:%.*]] = and i8 [[TMP25]], [[TMP31]] -; CHECK-NEXT: [[TMP35:%.*]] = and i8 [[TMP32]], [[TMP31]] -; CHECK-NEXT: [[TMP36:%.*]] = and i8 [[TMP25]], [[TMP33]] -; CHECK-NEXT: [[TMP37:%.*]] = or i8 [[TMP34]], [[TMP35]] -; CHECK-NEXT: [[TMP38:%.*]] = or i8 [[TMP37]], [[TMP36]] -; CHECK-NEXT: [[RES13:%.*]] = or i8 [[RES11]], [[RES12]] -; CHECK-NEXT: store i8 [[TMP38]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES13]] -; - %res1 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 2, i8 -1, i32 4) - %res2 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 3, i8 -1, i32 8) - %res3 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 4, 
i8 %x3, i32 4) - %res4 = call i8 @llvm.x86.avx512.mask.cmp.sd(<2 x double> %x0, <2 x double> %x1, i32 5, i8 %x3, i32 8) - - %res11 = or i8 %res1, %res2 - %res12 = or i8 %res3, %res4 - %res13 = or i8 %res11, %res12 - ret i8 %res13 -} - -declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32) - -define i8@test_int_x86_avx512_mask_cmp_ss(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 3, i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: store i8 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES2]] -; - %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 %x3, i32 4) - ret i8 %res2 -} - - -define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_cmp_ss_all( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES1:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 2, i8 -1, i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; 
CHECK: 11: -; CHECK-NEXT: [[RES2:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 3, i8 -1, i32 8) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSCMP5]], [[_MSCMP6]] -; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR9:%.*]] = or i1 [[_MSOR7]], [[_MSCMP8]] -; CHECK-NEXT: br i1 [[_MSOR9]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 15: -; CHECK-NEXT: [[RES3:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 4, i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP17]], 0 -; CHECK-NEXT: [[_MSOR12:%.*]] = or i1 [[_MSCMP10]], [[_MSCMP11]] -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR14:%.*]] = or i1 [[_MSOR12]], [[_MSCMP13]] -; CHECK-NEXT: br i1 [[_MSOR14]], label [[TMP18:%.*]], label [[TMP19:%.*]], !prof [[PROF1]] -; CHECK: 18: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 19: -; CHECK-NEXT: [[RES4:%.*]] = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 5, i8 [[X3]], i32 8) -; CHECK-NEXT: [[TMP20:%.*]] = and i8 [[RES1]], 0 -; CHECK-NEXT: [[TMP21:%.*]] = and i8 0, [[RES2]] -; CHECK-NEXT: [[TMP22:%.*]] = or i8 0, [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = or i8 [[TMP22]], [[TMP21]] -; CHECK-NEXT: [[RES11:%.*]] = and i8 [[RES1]], [[RES2]] -; CHECK-NEXT: [[TMP24:%.*]] = and i8 [[RES3]], 0 -; CHECK-NEXT: [[TMP25:%.*]] = and i8 0, [[RES4]] -; CHECK-NEXT: [[TMP26:%.*]] = or i8 0, [[TMP24]] -; CHECK-NEXT: [[TMP27:%.*]] = or i8 [[TMP26]], [[TMP25]] -; CHECK-NEXT: [[RES12:%.*]] = and i8 [[RES3]], [[RES4]] -; CHECK-NEXT: [[TMP28:%.*]] = and i8 [[TMP23]], [[TMP27]] -; CHECK-NEXT: [[TMP29:%.*]] = and i8 [[RES11]], [[TMP27]] -; CHECK-NEXT: [[TMP30:%.*]] = and i8 [[TMP23]], [[RES12]] -; CHECK-NEXT: [[TMP31:%.*]] = or i8 [[TMP28]], [[TMP29]] -; CHECK-NEXT: [[TMP32:%.*]] = or i8 [[TMP31]], [[TMP30]] -; CHECK-NEXT: [[RES13:%.*]] = and i8 [[RES11]], [[RES12]] -; CHECK-NEXT: store i8 [[TMP32]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i8 [[RES13]] -; - %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4) - %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 3, i8 -1, i32 8) - %res3 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 4, i8 %x3, i32 4) - %res4 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 5, i8 %x3, i32 8) - - %res11 = and i8 %res1, %res2 - %res12 = and i8 %res3, %res4 - %res13 = and i8 %res11, %res12 - ret i8 %res13 -} - -declare <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double>, i32, <8 x double>, i8, i32) - -define <8 x double>@test_int_x86_avx512_mask_getmant_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, 
align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0:%.*]], i32 11, <8 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> [[X0]], i32 11, <8 x double> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <8 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 %x3, i32 4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.getmant.pd.512(<8 x double> %x0, i32 11, <8 x double> %x2, i8 -1, i32 8) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 -} - -declare <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float>, i32, <16 x float>, i16, i32) - -define <16 x float>@test_int_x86_avx512_mask_getmant_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP3]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; 
CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0:%.*]], i32 11, <16 x float> [[X2:%.*]], i16 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i512 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR6:%.*]] = or i1 [[_MSCMP4]], [[_MSCMP5]] -; CHECK-NEXT: br i1 [[_MSOR6]], label [[TMP10:%.*]], label [[TMP11:%.*]], !prof [[PROF1]] -; CHECK: 10: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 11: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> [[X0]], i32 11, <16 x float> [[X2]], i16 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 %x3, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.getmant.ps.512(<16 x float> %x0, i32 11, <16 x float> %x2, i16 -1, i32 8) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 -} - -declare <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double>, <2 x double>, i32, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], i32 11, <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp 
ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 12, <2 x double> zeroinitializer, i8 [[X3]], i32 4) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 13, <2 x double> [[X2]], i8 [[X3]], i32 8) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSCMP18]], [[_MSCMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP21:%.*]] = icmp ne i128 [[TMP21]], 0 -; CHECK-NEXT: [[_MSOR22:%.*]] = or i1 [[_MSOR20]], [[_MSCMP21]] -; CHECK-NEXT: br i1 [[_MSOR22]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[RES3:%.*]] = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> [[X0]], <2 x double> [[X1]], i32 14, <2 x double> [[X2]], i8 -1, i32 4) -; CHECK-NEXT: [[RES11:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES12:%.*]] = fadd <2 x double> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES13:%.*]] = fadd <2 x double> [[RES11]], [[RES12]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES13]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 12, <2 x double> zeroinitializer, i8 %x3, i32 4) - %res2 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 13, <2 x double> %x2, i8 %x3, i32 8) - %res3 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 14, <2 x double> %x2, i8 -1, i32 4) - %res11 = fadd <2 x double> %res, %res1 - %res12 = fadd <2 x double> %res2, %res3 - %res13 = fadd <2 x double> %res11, %res12 - ret <2 x double> %res13 -} - -declare <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float>, <4 x float>, i32, <4 x float>, i8, i32) - -define <4 x 
float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], i32 11, <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 12, <4 x float> zeroinitializer, i8 [[X3]], i32 4) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 13, <4 x float> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i128 [[TMP19]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = bitcast <4 
x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i128 [[TMP20]], 0 -; CHECK-NEXT: [[_MSOR18:%.*]] = or i1 [[_MSCMP16]], [[_MSCMP17]] -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <4 x i32> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i128 [[TMP21]], 0 -; CHECK-NEXT: [[_MSOR20:%.*]] = or i1 [[_MSOR18]], [[_MSCMP19]] -; CHECK-NEXT: br i1 [[_MSOR20]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]] -; CHECK: 22: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 23: -; CHECK-NEXT: [[RES3:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0]], <4 x float> [[X1]], i32 14, <4 x float> [[X2]], i8 -1, i32 4) -; CHECK-NEXT: [[RES11:%.*]] = fadd <4 x float> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES12:%.*]] = fadd <4 x float> [[RES2]], [[RES3]] -; CHECK-NEXT: [[RES13:%.*]] = fadd <4 x float> [[RES11]], [[RES12]] -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES13]] -; - %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4) - %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 12, <4 x float> zeroinitializer, i8 %x3, i32 4) - %res2 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 13, <4 x float> %x2, i8 -1, i32 8) - %res3 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 14, <4 x float> %x2, i8 -1, i32 4) - %res11 = fadd <4 x float> %res, %res1 - %res12 = fadd <4 x float> %res2, %res3 - %res13 = fadd <4 x float> %res11, %res12 - ret <4 x float> %res13 -} - -define <4 x float> @test_int_x86_avx512_mask_getmant_ss_load(<4 x float> %x0, ptr %x1p) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_getmant_ss_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[X1:%.*]] = load <4 x float>, ptr [[X1P:%.*]], align 16 -; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[X1P]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 -; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 16 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP8]], 0 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i32> [[_MSLD]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1]], i32 11, <4 x float> undef, i8 -1, i32 4) -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES]] -; - %x1 = load <4 x float>, ptr %x1p - %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> undef, i8 -1, i32 
4) - ret <4 x float> %res -} - -declare <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double>, <8 x i64>) - -define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) - ret <8 x double> %res -} - -define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_mask(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> 
[[RES]], <8 x double> [[X2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> %x2 - ret <8 x double> %res2 -} - -define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512_maskz(<8 x double> %x0, <8 x i64> %x1, i8 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_pd_512_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x double> [[RES]] to <8 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP13]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x double> [[RES]], <8 x double> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES2]] -; - %res = call <8 x double> @llvm.x86.avx512.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x double> %res, <8 x double> zeroinitializer - ret <8 x double> %res2 -} - -declare <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float>, <16 x i32>) - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; 
CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2 - ret <16 x float> %res2 -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: 
[[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP13]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer - ret <16 x float> %res2 -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool(<16 x float> %x0, <16 x i32> %x1) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP3:%.*]], label [[TMP4:%.*]], !prof [[PROF1]] -; CHECK: 3: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 4: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> ) - ret <16 x float> %res -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_mask( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call 
void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> [[TMP8]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> [[X2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> ) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> %x2 - ret <16 x float> %res2 -} - -define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz(<16 x float> %x0, <16 x i32> %x1, i16 %mask) #0 { -; CHECK-LABEL: @test_int_x86_avx512_vpermilvar_ps_512_constant_pool_maskz( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> [[X0:%.*]], <16 x i32> ) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x float> [[RES]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x float> [[RES]], <16 x float> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES2]] -; - %res = call <16 x float> @llvm.x86.avx512.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> ) - 
%mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x float> %res, <16 x float> zeroinitializer - ret <16 x float> %res2 -} - -declare <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double>, <4 x float>, <2 x double>, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_cvt_ss2sd_round(<2 x double> %x0,<4 x float> %x1, <2 x double> %x2, i8 %x3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_ss2sd_round( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0:%.*]], <4 x float> [[X1:%.*]], <2 x double> [[X2:%.*]], i8 [[X3:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 13: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 14: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> [[X0]], <4 x float> [[X1]], <2 x double> [[X2]], i8 -1, i32 8) -; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES2]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 %x3, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtss2sd.round(<2 x double> %x0, <4 x float> %x1, <2 x double> %x2, i8 -1, i32 8) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 -} - -declare <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float>, <2 x double>, <4 x float>, i8, i32) - -define <4 x float>@test_int_x86_avx512_mask_cvt_sd2ss_round(<4 x 
float> %x0,<2 x double> %x1, <4 x float> %x2, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_cvt_sd2ss_round(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0:%.*]], <2 x double> [[X1:%.*]], <4 x float> [[X2:%.*]], i8 [[X3:%.*]], i32 11)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i128 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
-; CHECK: 13:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 14:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> [[X0]], <2 x double> [[X1]], <4 x float> [[X2]], i8 -1, i32 8)
-; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES2]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 %x3, i32 11)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtsd2ss.round(<4 x float> %x0, <2 x double> %x1, <4 x float> %x2, i8 -1, i32 8)
- %res2 = fadd <4 x float> %res, %res1
- ret <4 x float> %res2
-}
-
-declare <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32)
-
-define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pternlog_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP9]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
- ret <16 x i32> %1
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], [[X0]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> [[X0]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
- %2 = bitcast i16 %x4 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0
- ret <16 x i32> %3
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_d_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[X4:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP17]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33)
- %2 = bitcast i16 %x4 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
- ret <16 x i32> %3
-}
-
-declare <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32)
-
-define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_pternlog_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP9]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
- ret <8 x i64> %1
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_pternlog_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], [[X0]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP1]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> [[X0]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
- %2 = bitcast i8 %x4 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x0
- ret <8 x i64> %3
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_pternlog_q_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR3]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 33)
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X4:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP17]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33)
- %2 = bitcast i8 %x4 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
- ret <8 x i64> %3
-}
-
-define i32 @test_x86_avx512_comi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_eq_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_eq_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_comi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_eq(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 0, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 0, i32 4)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_eq(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_eq(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 8, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 8, i32 4)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_comi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_lt_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_lt_sae(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt_sae(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 8)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 8)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_comi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_comi_sd_lt(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 1, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 1, i32 4)
- ret i32 %res
-}
-
-define i32 @test_x86_avx512_ucomi_sd_lt(<2 x double> %a0, <2 x double> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_sd_lt(
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> [[A0:%.*]], <2 x double> [[A1:%.*]], i32 9, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.sd(<2 x double> %a0, <2 x double> %a1, i32 9, i32 4)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.vcomi.sd(<2 x double>, <2 x double>, i32, i32)
-
-define i32 @test_x86_avx512_ucomi_ss_lt(<4 x float> %a0, <4 x float> %a1) #0 {
-; CHECK-LABEL: @test_x86_avx512_ucomi_ss_lt(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[RES:%.*]] = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> [[A0:%.*]], <4 x float> [[A1:%.*]], i32 9, i32 4)
-; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret i32 [[RES]]
-;
- %res = call i32 @llvm.x86.avx512.vcomi.ss(<4 x float> %a0, <4 x float> %a1, i32 9, i32 4)
- ret i32 %res
-}
-
-declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32)
-
-declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>)
-
-define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_df_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP7]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
- ret <8 x double> %1
-}
-
-define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_df_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> zeroinitializer, <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x double> [[TMP9]] to <8 x i64>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x double> [[X2:%.*]] to <8 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i64> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i64> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> [[TMP17]], <8 x i64> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x double> [[TMP9]], <8 x double> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP18]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2
- ret <8 x double> %3
-}
-
-define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_df_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i64> zeroinitializer, <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x double> [[TMP8]] to <8 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP15]], <8 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <8 x i1> [[TMP10]], <8 x double> [[TMP8]], <8 x double> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[TMP16]]
-;
- %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>)
-
-define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_di_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP3]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
- ret <8 x i64> %1
-}
-
-define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_di_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[_MSPROP]], <8 x i64> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i64> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP11]], <8 x i64> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP7]], <8 x i64> [[TMP5]], <8 x i64> [[X2]]
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP12]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2
- ret <8 x i64> %3
-}
-
-define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_di_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> [[X0:%.*]], <8 x i64> [[X1:%.*]])
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP3]] to <8 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[_MSPROP]], <8 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer
-; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x i64> [[TMP11]]
-;
- %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1)
- %2 = bitcast i8 %x3 to <8 x i1>
- %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
- ret <8 x i64> %3
-}
-
-declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>)
-
-define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_sf_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]]
-; CHECK: 5:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 6:
-; CHECK-NEXT: [[TMP7:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP7]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
- ret <16 x float> %1
-}
-
-define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_sf_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP7:%.*]], label [[TMP8:%.*]], !prof [[PROF1]]
-; CHECK: 7:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 8:
-; CHECK-NEXT: [[TMP9:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[X2:%.*]] to <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]]
-; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP18]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> %x2
- ret <16 x float> %3
-}
-
-define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_sf_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
-; CHECK-NEXT: [[TMP8:%.*]] = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[TMP8]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[TMP12]], zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], zeroinitializer
-; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP15]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[TMP8]], <16 x float> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x float> [[TMP16]]
-;
- %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
- ret <16 x float> %3
-}
-
-declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>)
-
-define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_permvar_si_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP3]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
- ret <16 x i32> %1
-}
-
-define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_permvar_si_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[_MSPROP]], <16 x i32> [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = xor <16 x i32> [[TMP5]], [[X2:%.*]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP4]]
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP11]], <16 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP7]], <16 x i32> [[TMP5]], <16 x i32> [[X2]]
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP12]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2
- ret <16 x i32> %3
-}
-
-define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_permvar_si_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> [[X0:%.*]], <16 x i32> [[X1:%.*]])
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[X3:%.*]] to <16 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[_MSPROP]], <16 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[_MSPROP]]
-; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], zeroinitializer
-; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]]
-; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer
-; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <16 x i32> [[TMP11]]
-;
- %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1)
- %2 = bitcast i16 %x3 to <16 x i1>
- %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
- ret <16 x i32> %3
-}
-
-declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
-
-define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 4, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> [[X1]], <8 x i64> [[X2]], i32 5, i8 [[X4]], i32 4)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 3, i8 -1, i32 8)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
- %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
- %res2 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 8)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512_load(<8 x double> %x0, <8 x double> %x1, ptr %x2ptr) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_pd_512_load(
-; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]]
-; CHECK: 4:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 5:
-; CHECK-NEXT: [[X2:%.*]] = load <8 x i64>, ptr [[X2PTR:%.*]], align 64
-; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080
-; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr
-; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i64>, ptr [[TMP8]], align 64
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[_MSLD]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]]
-; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2]], i32 3, i8 -1, i32 4)
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES]]
-;
- %x2 = load <8 x i64>, ptr %x2ptr
- %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 -1, i32 4)
- ret <8 x double> %res
-}
-
-declare <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32)
-
-define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_pd_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0:%.*]], <8 x double> [[X1:%.*]], <8 x i64> [[X2:%.*]], i32 3, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 4)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i64> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i64> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i64> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> [[X0]], <8 x double> [[X1]], <8 x i64> [[X2]], i32 2, i8 -1, i32 8)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <8 x double> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <8 x double> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <8 x double> [[RES4]]
-;
- %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
- %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
- %res2 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 2, i8 -1, i32 8)
- %res3 = fadd <8 x double> %res, %res1
- %res4 = fadd <8 x double> %res3, %res2
- ret <8 x double> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 4)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 5, i8 -1, i32 8)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
- %res2 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 -1, i32 8)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res3, %res2
- ret <4 x float> %res4
-}
-
-declare <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float>, <4 x float>, <4 x i32>, i32, i8, i32)
-
-define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i8 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ss(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0:%.*]], <4 x float> [[X1:%.*]], <4 x i32> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]]
-; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]]
-; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]]
-; CHECK: 12:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 13:
-; CHECK-NEXT: [[RES1:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> zeroinitializer, i32 5, i8 [[X4]], i32 8)
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i32> [[TMP2]] to i128
-; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0
-; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]]
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <4 x i32> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0
-; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]]
-; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]]
-; CHECK: 17:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 18:
-; CHECK-NEXT: [[RES2:%.*]] = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> [[X0]], <4 x float> [[X1]], <4 x i32> [[X2]], i32 6, i8 -1, i32 4)
-; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[RES]], [[RES1]]
-; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[RES3]], [[RES2]]
-; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8
-; CHECK-NEXT: ret <4 x float> [[RES4]]
-;
- %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
- %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 8)
- %res2 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 6, i8 -1, i32 4)
- %res3 = fadd <4 x float> %res, %res1
- %res4 = fadd <4 x float> %res3, %res2
- ret <4 x float> %res4
-}
-
-declare <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32)
-
-define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 {
-; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512(
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
-; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8
-; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512
-; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0
-; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]]
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]]
-; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]]
-; CHECK: 8:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 9:
-; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4)
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP6:%.*]] =
icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 5, i16 [[X4]], i32 4) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 5, i16 -1, i32 8) -; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES4]] -; - %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4) - %res2 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 8) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 -} - -define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512_load(<16 x float> %x0, <16 x float> %x1, ptr %x2ptr) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_ps_512_load( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[X2:%.*]] = load <16 x i32>, ptr [[X2PTR:%.*]], align 64 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[X2PTR]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load 
<16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP9]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[_MSLD]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSOR]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2]], i32 5, i16 -1, i32 4) -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %x2 = load <16 x i32>, ptr %x2ptr - %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 -1, i32 4) - ret <16 x float> %res -} - -declare <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float>, <16 x float>, <16 x i32>, i32, i16, i32) - -define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i16 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_ps_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0:%.*]], <16 x float> [[X1:%.*]], <16 x i32> [[X2:%.*]], i32 5, i16 [[X4:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i512 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i512 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i16 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 
[[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> zeroinitializer, i32 6, i16 [[X4]], i32 8) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i512 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i32> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i512 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i32> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i512 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> [[X0]], <16 x float> [[X1]], <16 x i32> [[X2]], i32 7, i16 -1, i32 4) -; CHECK-NEXT: [[RES3:%.*]] = fadd <16 x float> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <16 x float> [[RES3]], [[RES2]] -; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES4]] -; - %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4) - %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 6, i16 %x4, i32 8) - %res2 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 7, i16 -1, i32 4) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 -} - -declare <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) - -define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_fixupimm_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], 
label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: br i1 [[_MSOR15]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 6, i8 -1, i32 4) -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) - %res2 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 6, i8 -1, i32 4) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 -} - -declare <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double>, <2 x double>, <2 x i64>, i32, i8, i32) - -define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i8 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_fixupimm_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr 
@__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP6]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] -; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSOR3]], [[_MSCMP4]] -; CHECK-NEXT: br i1 [[_MSOR5]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[RES:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0:%.*]], <2 x double> [[X1:%.*]], <2 x i64> [[X2:%.*]], i32 5, i8 [[X4:%.*]], i32 4) -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i128 [[TMP10]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i128 [[TMP11]], 0 -; CHECK-NEXT: [[_MSOR8:%.*]] = or i1 [[_MSCMP6]], [[_MSCMP7]] -; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR10:%.*]] = or i1 [[_MSOR8]], [[_MSCMP9]] -; CHECK-NEXT: br i1 [[_MSOR10]], label [[TMP12:%.*]], label [[TMP13:%.*]], !prof [[PROF1]] -; CHECK: 12: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 13: -; CHECK-NEXT: [[RES1:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> zeroinitializer, i32 5, i8 [[X4]], i32 8) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <2 x i64> [[TMP1]] to i128 -; CHECK-NEXT: [[_MSCMP11:%.*]] = icmp ne i128 [[TMP14]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[_MSCMP12:%.*]] = icmp ne i128 [[TMP15]], 0 -; CHECK-NEXT: [[_MSOR13:%.*]] = or i1 [[_MSCMP11]], [[_MSCMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <2 x i64> [[TMP3]] to i128 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i128 [[TMP16]], 0 -; CHECK-NEXT: [[_MSOR15:%.*]] = or i1 [[_MSOR13]], [[_MSCMP14]] -; CHECK-NEXT: [[_MSCMP16:%.*]] = icmp ne i8 [[TMP4]], 0 -; CHECK-NEXT: [[_MSOR17:%.*]] = or i1 [[_MSOR15]], [[_MSCMP16]] -; CHECK-NEXT: br i1 [[_MSOR17]], label [[TMP17:%.*]], label [[TMP18:%.*]], !prof [[PROF1]] -; CHECK: 17: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 18: -; CHECK-NEXT: [[RES2:%.*]] = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> [[X0]], <2 x double> [[X1]], <2 x i64> [[X2]], i32 5, i8 [[X4]], i32 8) -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[RES]], [[RES1]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[RES3]], [[RES2]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4) - %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8) - %res2 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x 
double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 8) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 -} - -declare double @llvm.fma.f64(double, double, double) #1 -declare double @llvm.x86.avx512.vfmadd.f64(double, double, double, i32) #0 - -define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP5]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X0]], double [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label 
[[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP27]] to i64 -; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X0]], double [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = extractelement <2 x double> %x0, i64 0 - %2 = extractelement <2 x double> %x1, i64 0 - %3 = extractelement <2 x double> %x2, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double %1 - %8 = insertelement <2 x double> %x0, double %7, i64 0 - %9 = extractelement <2 x double> %x0, i64 0 - %10 = extractelement <2 x double> %x1, i64 0 - %11 = extractelement <2 x double> %x2, i64 0 - %12 = call double 
@llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) - %13 = insertelement <2 x double> %x0, double %12, i64 0 - %14 = extractelement <2 x double> %x0, i64 0 - %15 = extractelement <2 x double> %x1, i64 0 - %16 = extractelement <2 x double> %x2, i64 0 - %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, double %17, double %14 - %21 = insertelement <2 x double> %x0, double %20, i64 0 - %res3 = fadd <2 x double> %8, %13 - %res4 = fadd <2 x double> %21, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP5]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP5]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne 
i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP11]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP27]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP11]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP27]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X0]], float [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %x2, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = 
extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %1 - %8 = insertelement <4 x float> %x0, float %7, i64 0 - %9 = extractelement <4 x float> %x0, i64 0 - %10 = extractelement <4 x float> %x1, i64 0 - %11 = extractelement <4 x float> %x2, i64 0 - %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) - %13 = insertelement <4 x float> %x0, float %12, i64 0 - %14 = extractelement <4 x float> %x0, i64 0 - %15 = extractelement <4 x float> %x1, i64 0 - %16 = extractelement <4 x float> %x2, i64 0 - %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, float %17, float %14 - %21 = insertelement <4 x float> %x0, float %20, i64 0 - %res3 = fadd <4 x float> %8, %13 - %res4 = fadd <4 x float> %21, %res3 - ret <4 x float> %res4 -} - -define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){ -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_sd( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = call double @llvm.fma.f64(double [[TMP1]], double [[TMP2]], double [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i64 0, i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast double [[TMP4]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = or i64 [[TMP10]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i64 [[TMP11]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], double [[TMP4]], double 0.000000e+00 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> [[X0]], double [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]], i32 11) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i64 0, i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast double [[TMP17]] to i64 -; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = or i64 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i64 [[TMP24]], i64 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], double [[TMP17]], double 0.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X0]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[RES2:%.*]] = fadd <2 x double> [[TMP13]], [[TMP26]] -; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES2]] -; - %1 = extractelement <2 x double> %x0, i64 0 - %2 = extractelement <2 x double> %x1, i64 0 - %3 = extractelement <2 x double> %x2, i64 0 - %4 = call double @llvm.fma.f64(double 
%1, double %2, double %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double 0.000000e+00 - %8 = insertelement <2 x double> %x0, double %7, i64 0 - %9 = extractelement <2 x double> %x0, i64 0 - %10 = extractelement <2 x double> %x1, i64 0 - %11 = extractelement <2 x double> %x2, i64 0 - %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) - %13 = bitcast i8 %x3 to <8 x i1> - %14 = extractelement <8 x i1> %13, i64 0 - %15 = select i1 %14, double %12, double 0.000000e+00 - %16 = insertelement <2 x double> %x0, double %15, i64 0 - %res2 = fadd <2 x double> %8, %16 - ret <2 x double> %res2 -} - -declare float @llvm.fma.f32(float, float, float) #1 -declare float @llvm.x86.avx512.vfmadd.f32(float, float, float, i32) #0 - -define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){ -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = call float @llvm.fma.f32(float [[TMP1]], float [[TMP2]], float [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x i1> [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = select i1 [[TMP6]], i32 0, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast float [[TMP4]] to i32 -; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[TMP8]], 0 -; CHECK-NEXT: [[TMP10:%.*]] = or i32 [[TMP9]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = or i32 [[TMP10]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP11]], i32 [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], float [[TMP4]], float 0.000000e+00 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x float> [[X0]], float [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]], i32 11) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 0, i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP17]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT1:%.*]] = select i1 false, i32 [[TMP24]], i32 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP17]], float 0.000000e+00 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X0]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[RES2:%.*]] = fadd <4 x float> [[TMP13]], [[TMP26]] -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES2]] -; - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %x2, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float 
%4, float 0.000000e+00 - %8 = insertelement <4 x float> %x0, float %7, i64 0 - %9 = extractelement <4 x float> %x0, i64 0 - %10 = extractelement <4 x float> %x1, i64 0 - %11 = extractelement <4 x float> %x2, i64 0 - %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) - %13 = bitcast i8 %x3 to <8 x i1> - %14 = extractelement <8 x i1> %13, i64 0 - %15 = select i1 %14, float %12, float 0.000000e+00 - %16 = insertelement <4 x float> %x0, float %15, i64 0 - %res2 = fadd <4 x float> %8, %16 - ret <4 x float> %res2 -} - -define <4 x float> @test_int_x86_avx512_maskz_vfmadd_ss_load0(i8 zeroext %0, ptr nocapture readonly %1, float %2, float %3) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_load0( -; CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[TMP11:%.*]] = load <4 x float>, ptr [[TMP1:%.*]], align 16 -; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP1]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = xor i64 [[TMP12]], 87960930222080 -; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[TMP13]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP14]], align 16 -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[_MSLD]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP11]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = or i32 [[_MSPROP]], [[TMP6]] -; CHECK-NEXT: [[_MSPROP2:%.*]] = or i32 [[_MSPROP1]], [[TMP7]] -; CHECK-NEXT: [[TMP16:%.*]] = tail call float @llvm.fma.f32(float [[TMP15]], float [[TMP2:%.*]], float [[TMP3:%.*]]) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8 [[TMP8]] to <8 x i1> -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP0:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <8 x i1> [[TMP17]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i32 [[_MSPROP2]], i32 0 -; CHECK-NEXT: [[TMP21:%.*]] = bitcast float [[TMP16]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = xor i32 [[TMP21]], 0 -; CHECK-NEXT: [[TMP23:%.*]] = or i32 [[TMP22]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP3]], i32 [[TMP24]], i32 [[TMP20]] -; CHECK-NEXT: [[TMP25:%.*]] = select i1 [[TMP19]], float [[TMP16]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <4 x i32> [[_MSLD]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP11]], float [[TMP25]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP4]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP26]] -; - %5 = load <4 x float>, ptr %1, align 16 - %6 = extractelement <4 x float> %5, i64 0 - %7 = tail call float @llvm.fma.f32(float %6, float %2, float %3) #2 - %8 = bitcast i8 %0 to <8 x i1> - %9 = extractelement <8 x i1> %8, i64 0 - %10 = 
select i1 %9, float %7, float 0.000000e+00 - %11 = insertelement <4 x float> %5, float %10, i64 0 - ret <4 x float> %11 -} - -define <2 x double> @test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call double @llvm.fma.f64(double [[TMP5]], double [[TMP6]], double [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[_MSPROP4]], i64 [[_MSPROP2]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast double [[TMP8]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast double [[TMP7]] to i64 -; CHECK-NEXT: [[TMP15:%.*]] = xor i64 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i64 [[TMP17]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], double [[TMP8]], double [[TMP7]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> [[X2]], double [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: 
[[TMP25:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP20]], double [[TMP21]], double [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[X2]], double [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP27]], double [[TMP28]], double [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i64 0, i64 [[_MSPROP13]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast double [[TMP32]] to i64 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast double [[TMP29]] to i64 -; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i64 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i64 [[TMP40]], [[_MSPROP13]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i64 [[TMP41]], i64 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], double [[TMP32]], double [[TMP29]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <2 x double> [[X2]], double [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <2 x i64> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <2 x i64> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = extractelement <2 x double> %x0, i64 0 - %2 = extractelement <2 x double> %x1, i64 0 - %3 = extractelement <2 x double> %x2, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double %3 - %8 = insertelement <2 x double> %x2, double %7, i64 0 - %9 = extractelement <2 x double> %x0, i64 0 - %10 = extractelement <2 x double> %x1, i64 0 - %11 = extractelement <2 x double> %x2, i64 0 - %12 = call double @llvm.x86.avx512.vfmadd.f64(double %9, double %10, double %11, i32 11) - %13 = insertelement <2 x double> %x2, double %12, i64 0 - %14 = extractelement <2 x double> %x0, i64 0 - %15 
= extractelement <2 x double> %x1, i64 0 - %16 = extractelement <2 x double> %x2, i64 0 - %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, double %17, double %16 - %21 = insertelement <2 x double> %x2, double %20, i64 0 - %res3 = fadd <2 x double> %8, %13 - %res4 = fadd <2 x double> %21, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X2:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP8:%.*]] = call float @llvm.fma.f32(float [[TMP5]], float [[TMP6]], float [[TMP7]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <8 x i1> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <8 x i1> [[TMP10]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 [[_MSPROP4]], i32 [[_MSPROP2]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float [[TMP8]] to i32 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP7]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP13]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP5]], i32 [[TMP17]], i32 [[TMP12]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP11]], float [[TMP8]], float [[TMP7]] -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X2]], float [[TMP18]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP7]], 0 -; CHECK-NEXT: [[_MSCMP19:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP19]] -; CHECK-NEXT: [[_MSCMP20:%.*]] = 
icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR21:%.*]] = or i1 [[_MSOR]], [[_MSCMP20]] -; CHECK-NEXT: br i1 [[_MSOR21]], label [[TMP23:%.*]], label [[TMP24:%.*]], !prof [[PROF1]] -; CHECK: 23: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 24: -; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP20]], float [[TMP21]], float [[TMP22]], i32 11) -; CHECK-NEXT: [[_MSPROP10:%.*]] = insertelement <4 x i32> [[TMP3]], i32 0, i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[X2]], float [[TMP25]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP11]], 0 -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP12]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSCMP22]], [[_MSCMP23]] -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSOR26:%.*]] = or i1 [[_MSOR24]], [[_MSCMP25]] -; CHECK-NEXT: br i1 [[_MSOR26]], label [[TMP30:%.*]], label [[TMP31:%.*]], !prof [[PROF1]] -; CHECK: 30: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 31: -; CHECK-NEXT: [[TMP32:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP27]], float [[TMP28]], float [[TMP29]], i32 10) -; CHECK-NEXT: [[TMP33:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP34:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP33]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i1> [[TMP34]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP35]], i32 0, i32 [[_MSPROP13]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast float [[TMP32]] to i32 -; CHECK-NEXT: [[TMP38:%.*]] = bitcast float [[TMP29]] to i32 -; CHECK-NEXT: [[TMP39:%.*]] = xor i32 [[TMP37]], [[TMP38]] -; CHECK-NEXT: [[TMP40:%.*]] = or i32 [[TMP39]], 0 -; CHECK-NEXT: [[TMP41:%.*]] = or i32 [[TMP40]], [[_MSPROP13]] -; CHECK-NEXT: [[_MSPROP_SELECT15:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP41]], i32 [[TMP36]] -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP35]], float [[TMP32]], float [[TMP29]] -; CHECK-NEXT: [[_MSPROP16:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT15]], i64 0 -; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[X2]], float [[TMP42]], i64 0 -; CHECK-NEXT: [[_MSPROP17:%.*]] = or <4 x i32> [[_MSPROP6]], [[_MSPROP10]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP19]], [[TMP26]] -; CHECK-NEXT: [[_MSPROP18:%.*]] = or <4 x i32> [[_MSPROP16]], [[_MSPROP17]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP43]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP18]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %x2, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %3 - %8 = insertelement <4 x float> %x2, float %7, i64 0 - %9 = extractelement <4 x float> %x0, i64 0 - %10 = 
extractelement <4 x float> %x1, i64 0 - %11 = extractelement <4 x float> %x2, i64 0 - %12 = call float @llvm.x86.avx512.vfmadd.f32(float %9, float %10, float %11, i32 11) - %13 = insertelement <4 x float> %x2, float %12, i64 0 - %14 = extractelement <4 x float> %x0, i64 0 - %15 = extractelement <4 x float> %x1, i64 0 - %16 = extractelement <4 x float> %x2, i64 0 - %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 10) - %18 = bitcast i8 %x3 to <8 x i1> - %19 = extractelement <8 x i1> %18, i64 0 - %20 = select i1 %19, float %17, float %16 - %21 = insertelement <4 x float> %x2, float %20, i64 0 - %res3 = fadd <4 x float> %8, %13 - %res4 = fadd <4 x float> %21, %res3 - ret <4 x float> %res4 -} - -define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_ss_mask_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 -; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 -; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 -; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 -; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 -; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 
x float> [[BV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 -; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] -; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 [[_MSPROP9]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast float [[TMP14]] to i32 -; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], [[_MSPROP13]] -; CHECK-NEXT: [[TMP26:%.*]] = or i32 [[TMP25]], [[_MSPROP9]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP26]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], float [[TMP17]], float [[TMP14]] -; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x float> [[AV]], float [[TMP27]], i64 0 -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP28]], i32 0 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] -; CHECK: 29: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 30: -; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 -; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr -; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP33]], align 4 -; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 -; CHECK-NEXT: ret void -; - %a.val = load float, ptr %a - %av0 = insertelement <4 x float> undef, float %a.val, i32 0 - %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 - %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 - %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 - - %b.val = load float, ptr %b - %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 - %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 - %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 - %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 - %1 = extractelement <4 x float> %av, i64 0 - %2 = extractelement <4 x float> %bv, i64 0 - %3 = extractelement <4 x float> %av, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 
x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %1 - %8 = insertelement <4 x float> %av, float %7, i64 0 - %sr = extractelement <4 x float> %8, i32 0 - store float %sr, ptr %a - ret void -} - -define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_ss_maskz_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load float, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <4 x float> undef, float [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <4 x i32> [[_MSPROP]], i32 0, i32 1 -; CHECK-NEXT: [[AV1:%.*]] = insertelement <4 x float> [[AV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP2:%.*]] = insertelement <4 x i32> [[_MSPROP1]], i32 0, i32 2 -; CHECK-NEXT: [[AV2:%.*]] = insertelement <4 x float> [[AV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <4 x i32> [[_MSPROP2]], i32 0, i32 3 -; CHECK-NEXT: [[AV:%.*]] = insertelement <4 x float> [[AV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSCMP17:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP17]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load float, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD4:%.*]] = load i32, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[_MSPROP5:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD4]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <4 x float> undef, float [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[_MSPROP5]], i32 0, i32 1 -; CHECK-NEXT: [[BV1:%.*]] = insertelement <4 x float> [[BV0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[_MSPROP6]], i32 0, i32 2 -; CHECK-NEXT: [[BV2:%.*]] = insertelement <4 x float> [[BV1]], float 0.000000e+00, i32 2 -; CHECK-NEXT: [[_MSPROP8:%.*]] = insertelement <4 x i32> [[_MSPROP7]], i32 0, i32 3 -; CHECK-NEXT: [[BV:%.*]] = insertelement <4 x float> [[BV2]], float 0.000000e+00, i32 3 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[_MSPROP8]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[BV]], i64 0 -; 
CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[_MSPROP3]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = or i32 [[_MSPROP9]], [[_MSPROP10]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = or i32 [[_MSPROP12]], [[_MSPROP11]] -; CHECK-NEXT: [[TMP17:%.*]] = call float @llvm.fma.f32(float [[TMP14]], float [[TMP15]], float [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i32 [[_MSPROP13]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast float [[TMP17]] to i32 -; CHECK-NEXT: [[TMP23:%.*]] = xor i32 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i32 [[TMP23]], [[_MSPROP13]] -; CHECK-NEXT: [[TMP25:%.*]] = or i32 [[TMP24]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP14]], i32 [[TMP25]], i32 [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], float [[TMP17]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP15:%.*]] = insertelement <4 x i32> [[_MSPROP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x float> [[AV]], float [[TMP26]], i64 0 -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[_MSPROP15]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <4 x float> [[TMP27]], i32 0 -; CHECK-NEXT: [[_MSCMP18:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP18]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr -; CHECK-NEXT: store i32 [[_MSPROP16]], ptr [[TMP32]], align 4 -; CHECK-NEXT: store float [[SR]], ptr [[A]], align 4 -; CHECK-NEXT: ret void -; - %a.val = load float, ptr %a - %av0 = insertelement <4 x float> undef, float %a.val, i32 0 - %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1 - %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2 - %av = insertelement <4 x float> %av2, float 0.000000e+00, i32 3 - - %b.val = load float, ptr %b - %bv0 = insertelement <4 x float> undef, float %b.val, i32 0 - %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1 - %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2 - %bv = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3 - %1 = extractelement <4 x float> %av, i64 0 - %2 = extractelement <4 x float> %bv, i64 0 - %3 = extractelement <4 x float> %av, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float 0.000000e+00 - %8 = insertelement <4 x float> %av, float %7, i64 0 - %sr = extractelement <4 x float> %8, i32 0 - store float %sr, ptr %a - ret void -} - -define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_sd_mask_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls 
to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 -; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 -; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] -; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 [[_MSPROP5]] -; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 -; CHECK-NEXT: [[TMP23:%.*]] = bitcast double [[TMP14]] to i64 -; CHECK-NEXT: [[TMP24:%.*]] = xor i64 [[TMP22]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP26:%.*]] = or i64 [[TMP25]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP26]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP20]], double [[TMP17]], double [[TMP14]] -; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x 
i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> [[AV]], double [[TMP27]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP28]], i32 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP29:%.*]], label [[TMP30:%.*]], !prof [[PROF1]] -; CHECK: 29: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 30: -; CHECK-NEXT: [[TMP31:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP32:%.*]] = xor i64 [[TMP31]], 87960930222080 -; CHECK-NEXT: [[TMP33:%.*]] = inttoptr i64 [[TMP32]] to ptr -; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP33]], align 8 -; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 -; CHECK-NEXT: ret void -; - %a.val = load double, ptr %a - %av0 = insertelement <2 x double> undef, double %a.val, i32 0 - %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 - - %b.val = load double, ptr %b - %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 - %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 - %1 = extractelement <2 x double> %av, i64 0 - %2 = extractelement <2 x double> %bv, i64 0 - %3 = extractelement <2 x double> %av, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double %1 - %8 = insertelement <2 x double> %av, double %7, i64 0 - %sr = extractelement <2 x double> %8, i32 0 - store double %sr, ptr %a - ret void -} - -define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) #0 { -; CHECK-LABEL: @fmadd_sd_maskz_memfold( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[A_VAL:%.*]] = load double, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i64, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD]], i32 0 -; CHECK-NEXT: [[AV0:%.*]] = insertelement <2 x double> undef, double [[A_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = insertelement <2 x i64> [[_MSPROP]], i64 0, i32 1 -; CHECK-NEXT: [[AV:%.*]] = insertelement <2 x double> [[AV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSCMP13:%.*]] = icmp ne i64 [[TMP2]], 0 -; CHECK-NEXT: br i1 [[_MSCMP13]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]] -; CHECK: 9: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 10: -; CHECK-NEXT: [[B_VAL:%.*]] = load double, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[B]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = xor i64 [[TMP11]], 87960930222080 -; 
CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr -; CHECK-NEXT: [[_MSLD2:%.*]] = load i64, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[_MSPROP3:%.*]] = insertelement <2 x i64> splat (i64 -1), i64 [[_MSLD2]], i32 0 -; CHECK-NEXT: [[BV0:%.*]] = insertelement <2 x double> undef, double [[B_VAL]], i32 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = insertelement <2 x i64> [[_MSPROP3]], i64 0, i32 1 -; CHECK-NEXT: [[BV:%.*]] = insertelement <2 x double> [[BV0]], double 0.000000e+00, i32 1 -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <2 x i64> [[_MSPROP4]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x double> [[BV]], i64 0 -; CHECK-NEXT: [[_MSPROP7:%.*]] = extractelement <2 x i64> [[_MSPROP1]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[AV]], i64 0 -; CHECK-NEXT: [[_MSPROP8:%.*]] = or i64 [[_MSPROP5]], [[_MSPROP6]] -; CHECK-NEXT: [[_MSPROP9:%.*]] = or i64 [[_MSPROP8]], [[_MSPROP7]] -; CHECK-NEXT: [[TMP17:%.*]] = call double @llvm.fma.f64(double [[TMP14]], double [[TMP15]], double [[TMP16]]) -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8 [[C:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <8 x i1> [[TMP18]], i64 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i1> [[TMP19]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP20]], i64 [[_MSPROP9]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = bitcast double [[TMP17]] to i64 -; CHECK-NEXT: [[TMP23:%.*]] = xor i64 [[TMP22]], 0 -; CHECK-NEXT: [[TMP24:%.*]] = or i64 [[TMP23]], [[_MSPROP9]] -; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[TMP24]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP10]], i64 [[TMP25]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP26:%.*]] = select i1 [[TMP20]], double [[TMP17]], double 0.000000e+00 -; CHECK-NEXT: [[_MSPROP11:%.*]] = insertelement <2 x i64> [[_MSPROP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[AV]], double [[TMP26]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = extractelement <2 x i64> [[_MSPROP11]], i32 0 -; CHECK-NEXT: [[SR:%.*]] = extractelement <2 x double> [[TMP27]], i32 0 -; CHECK-NEXT: [[_MSCMP14:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP14]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = ptrtoint ptr [[A]] to i64 -; CHECK-NEXT: [[TMP31:%.*]] = xor i64 [[TMP30]], 87960930222080 -; CHECK-NEXT: [[TMP32:%.*]] = inttoptr i64 [[TMP31]] to ptr -; CHECK-NEXT: store i64 [[_MSPROP12]], ptr [[TMP32]], align 8 -; CHECK-NEXT: store double [[SR]], ptr [[A]], align 8 -; CHECK-NEXT: ret void -; - %a.val = load double, ptr %a - %av0 = insertelement <2 x double> undef, double %a.val, i32 0 - %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1 - - %b.val = load double, ptr %b - %bv0 = insertelement <2 x double> undef, double %b.val, i32 0 - %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1 - %1 = extractelement <2 x double> %av, i64 0 - %2 = extractelement <2 x double> %bv, i64 0 - %3 = extractelement <2 x double> %av, i64 0 - %4 = call double @llvm.fma.f64(double %1, double %2, double %3) - %5 = bitcast i8 %c to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, double %4, double 0.000000e+00 - 
%8 = insertelement <2 x double> %av, double %7, i64 0 - %sr = extractelement <2 x double> %8, i32 0 - store double %sr, ptr %a - ret void -} - -define <2 x double> @test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call double @llvm.fma.f64(double [[TMP6]], double [[TMP7]], double [[TMP8]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast double [[TMP9]] to i64 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or i64 [[TMP17]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP19]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], double [[TMP9]], double [[TMP10]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x double> [[X2]], double [[TMP20]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP22]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 -; 
CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] -; CHECK: 26: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 27: -; CHECK-NEXT: [[TMP28:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP23]], double [[TMP24]], double [[TMP25]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP1]], i64 0, i64 0 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <2 x double> [[X2]], double [[TMP28]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP31]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] -; CHECK: 35: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 36: -; CHECK-NEXT: [[TMP37:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP32]], double [[TMP33]], double [[TMP34]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i64 0, i64 [[_MSPROP16]] -; CHECK-NEXT: [[TMP43:%.*]] = bitcast double [[TMP37]] to i64 -; CHECK-NEXT: [[TMP44:%.*]] = bitcast double [[TMP38]] to i64 -; CHECK-NEXT: [[TMP45:%.*]] = xor i64 [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = or i64 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = or i64 [[TMP46]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP47]], i64 [[TMP42]] -; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], double [[TMP37]], double [[TMP38]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <2 x double> [[X2]], double [[TMP48]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP21]], [[TMP30]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP49]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = fneg <2 x double> %x2 - %2 = extractelement <2 x double> %x0, i64 0 
- %3 = extractelement <2 x double> %x1, i64 0 - %4 = extractelement <2 x double> %1, i64 0 - %5 = call double @llvm.fma.f64(double %2, double %3, double %4) - %6 = extractelement <2 x double> %x2, i64 0 - %7 = bitcast i8 %x3 to <8 x i1> - %8 = extractelement <8 x i1> %7, i64 0 - %9 = select i1 %8, double %5, double %6 - %10 = insertelement <2 x double> %x2, double %9, i64 0 - %11 = fneg <2 x double> %x2 - %12 = extractelement <2 x double> %x0, i64 0 - %13 = extractelement <2 x double> %x1, i64 0 - %14 = extractelement <2 x double> %11, i64 0 - %15 = call double @llvm.x86.avx512.vfmadd.f64(double %12, double %13, double %14, i32 11) - %16 = extractelement <2 x double> %x2, i64 0 - %17 = insertelement <2 x double> %x2, double %15, i64 0 - %18 = fneg <2 x double> %x2 - %19 = extractelement <2 x double> %x0, i64 0 - %20 = extractelement <2 x double> %x1, i64 0 - %21 = extractelement <2 x double> %18, i64 0 - %22 = call double @llvm.x86.avx512.vfmadd.f64(double %19, double %20, double %21, i32 10) - %23 = extractelement <2 x double> %x2, i64 0 - %24 = bitcast i8 %x3 to <8 x i1> - %25 = extractelement <8 x i1> %24, i64 0 - %26 = select i1 %25, double %22, double %23 - %27 = insertelement <2 x double> %x2, double %26, i64 0 - %res3 = fadd <2 x double> %10, %17 - %res4 = fadd <2 x double> %27, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmsub_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call float @llvm.fma.f32(float [[TMP6]], float [[TMP7]], float [[TMP8]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP11]], i64 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] -; CHECK-NEXT: [[TMP15:%.*]] = bitcast float [[TMP9]] to i32 -; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = xor i32 [[TMP15]], [[TMP16]] -; CHECK-NEXT: [[TMP18:%.*]] = or i32 [[TMP17]], [[_MSPROP4]] 
-; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP19]], i32 [[TMP14]] -; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP13]], float [[TMP9]], float [[TMP10]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[X2]], float [[TMP20]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP22]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP26:%.*]], label [[TMP27:%.*]], !prof [[PROF1]] -; CHECK: 26: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 27: -; CHECK-NEXT: [[TMP28:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP23]], float [[TMP24]], float [[TMP25]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP1]], i32 0, i64 0 -; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x float> [[X2]], float [[TMP28]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[X0]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x float> [[TMP31]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP35:%.*]], label [[TMP36:%.*]], !prof [[PROF1]] -; CHECK: 35: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 36: -; CHECK-NEXT: [[TMP37:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP32]], float [[TMP33]], float [[TMP34]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP39]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i1> [[TMP40]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 0, i32 [[_MSPROP16]] -; 
CHECK-NEXT: [[TMP43:%.*]] = bitcast float [[TMP37]] to i32 -; CHECK-NEXT: [[TMP44:%.*]] = bitcast float [[TMP38]] to i32 -; CHECK-NEXT: [[TMP45:%.*]] = xor i32 [[TMP43]], [[TMP44]] -; CHECK-NEXT: [[TMP46:%.*]] = or i32 [[TMP45]], 0 -; CHECK-NEXT: [[TMP47:%.*]] = or i32 [[TMP46]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP47]], i32 [[TMP42]] -; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP41]], float [[TMP37]], float [[TMP38]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[X2]], float [[TMP48]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP21]], [[TMP30]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP49]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = fneg <4 x float> %x2 - %2 = extractelement <4 x float> %x0, i64 0 - %3 = extractelement <4 x float> %x1, i64 0 - %4 = extractelement <4 x float> %1, i64 0 - %5 = call float @llvm.fma.f32(float %2, float %3, float %4) - %6 = extractelement <4 x float> %x2, i64 0 - %7 = bitcast i8 %x3 to <8 x i1> - %8 = extractelement <8 x i1> %7, i64 0 - %9 = select i1 %8, float %5, float %6 - %10 = insertelement <4 x float> %x2, float %9, i64 0 - %11 = fneg <4 x float> %x2 - %12 = extractelement <4 x float> %x0, i64 0 - %13 = extractelement <4 x float> %x1, i64 0 - %14 = extractelement <4 x float> %11, i64 0 - %15 = call float @llvm.x86.avx512.vfmadd.f32(float %12, float %13, float %14, i32 11) - %16 = extractelement <4 x float> %x2, i64 0 - %17 = insertelement <4 x float> %x2, float %15, i64 0 - %18 = fneg <4 x float> %x2 - %19 = extractelement <4 x float> %x0, i64 0 - %20 = extractelement <4 x float> %x1, i64 0 - %21 = extractelement <4 x float> %18, i64 0 - %22 = call float @llvm.x86.avx512.vfmadd.f32(float %19, float %20, float %21, i32 10) - %23 = extractelement <4 x float> %x2, i64 0 - %24 = bitcast i8 %x3 to <8 x i1> - %25 = extractelement <8 x i1> %24, i64 0 - %26 = select i1 %25, float %22, float %23 - %27 = insertelement <4 x float> %x2, float %26, i64 0 - %res3 = fadd <4 x float> %10, %17 - %res4 = fadd <4 x float> %27, %res3 - ret <4 x float> %res4 -} - -define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_sd( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <2 x double> [[X0:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = fneg <2 x double> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = 
extractelement <2 x double> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP6]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i64 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i64 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP10:%.*]] = call double @llvm.fma.f64(double [[TMP7]], double [[TMP8]], double [[TMP9]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i64 [[_MSPROP4]], i64 [[_MSPROP5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast double [[TMP10]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast double [[TMP11]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[TMP18]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP20:%.*]] = or i64 [[TMP19]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i64 [[TMP20]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], double [[TMP10]], double [[TMP11]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x double> [[X2]], double [[TMP21]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = fneg <2 x double> [[X0]] -; CHECK-NEXT: [[TMP24:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x double> [[TMP23]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <2 x double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <2 x double> [[TMP24]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i64 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i64 [[_MSPROP10]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP25]], double [[TMP26]], double [[TMP27]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <2 x i64> [[TMP2]], i64 0, i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x double> [[X2]], double [[TMP30]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = fneg <2 x double> [[X0]] -; CHECK-NEXT: [[TMP34:%.*]] = fneg <2 x double> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <2 x i64> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <2 x i64> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x 
double> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x double> [[TMP34]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i64 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i64 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i64 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] -; CHECK: 38: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 39: -; CHECK-NEXT: [[TMP40:%.*]] = call double @llvm.x86.avx512.vfmadd.f64(double [[TMP35]], double [[TMP36]], double [[TMP37]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <2 x double> [[X2]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i64 0, i64 [[_MSPROP16]] -; CHECK-NEXT: [[TMP46:%.*]] = bitcast double [[TMP40]] to i64 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast double [[TMP41]] to i64 -; CHECK-NEXT: [[TMP48:%.*]] = xor i64 [[TMP46]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = or i64 [[TMP48]], 0 -; CHECK-NEXT: [[TMP50:%.*]] = or i64 [[TMP49]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i64 [[TMP50]], i64 [[TMP45]] -; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], double [[TMP40]], double [[TMP41]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <2 x double> [[X2]], double [[TMP51]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <2 x i64> [[_MSPROP7]], [[_MSPROP12]] -; CHECK-NEXT: [[RES3:%.*]] = fadd <2 x double> [[TMP22]], [[TMP32]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <2 x i64> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <2 x double> [[TMP52]], [[RES3]] -; CHECK-NEXT: store <2 x i64> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <2 x double> [[RES4]] -; - %1 = fneg <2 x double> %x0 - %2 = fneg <2 x double> %x2 - %3 = extractelement <2 x double> %1, i64 0 - %4 = extractelement <2 x double> %x1, i64 0 - %5 = extractelement <2 x double> %2, i64 0 - %6 = call double @llvm.fma.f64(double %3, double %4, double %5) - %7 = extractelement <2 x double> %x2, i64 0 - %8 = bitcast i8 %x3 to <8 x i1> - %9 = extractelement <8 x i1> %8, i64 0 - %10 = select i1 %9, double %6, double %7 - %11 = insertelement <2 x double> %x2, double %10, i64 0 - %12 = fneg <2 x double> %x0 - %13 = fneg <2 x double> %x2 - %14 = extractelement <2 x double> %12, i64 0 - %15 = extractelement <2 x double> %x1, i64 0 - %16 = extractelement <2 x double> %13, i64 0 - %17 = call double @llvm.x86.avx512.vfmadd.f64(double %14, double %15, double %16, i32 11) - %18 = extractelement <2 x double> %x2, i64 0 - %19 = insertelement <2 x double> %x2, double %17, i64 0 - %20 = fneg <2 x double> %x0 - %21 = fneg <2 x double> %x2 - %22 = extractelement <2 x double> %20, i64 0 - %23 = extractelement <2 x double> %x1, i64 0 - %24 = extractelement <2 x double> %21, i64 0 - %25 = call double 
@llvm.x86.avx512.vfmadd.f64(double %22, double %23, double %24, i32 10) - %26 = extractelement <2 x double> %x2, i64 0 - %27 = bitcast i8 %x3 to <8 x i1> - %28 = extractelement <8 x i1> %27, i64 0 - %29 = select i1 %28, double %25, double %26 - %30 = insertelement <2 x double> %x2, double %29, i64 0 - %res3 = fadd <2 x double> %11, %19 - %res4 = fadd <2 x double> %30, %res3 - ret <2 x double> %res4 -} - -define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfnmsub_ss( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 48) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = fneg <4 x float> [[X0:%.*]] -; CHECK-NEXT: [[TMP6:%.*]] = fneg <4 x float> [[X2:%.*]] -; CHECK-NEXT: [[_MSPROP:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP5]], i64 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = or i32 [[_MSPROP]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP3]], [[_MSPROP2]] -; CHECK-NEXT: [[TMP10:%.*]] = call float @llvm.fma.f32(float [[TMP7]], float [[TMP8]], float [[TMP9]]) -; CHECK-NEXT: [[_MSPROP5:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i1> [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[_MSPROP4]], i32 [[_MSPROP5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast float [[TMP10]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = bitcast float [[TMP11]] to i32 -; CHECK-NEXT: [[TMP18:%.*]] = xor i32 [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = or i32 [[TMP18]], [[_MSPROP4]] -; CHECK-NEXT: [[TMP20:%.*]] = or i32 [[TMP19]], [[_MSPROP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP20]], i32 [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = select i1 [[TMP14]], float [[TMP10]], float [[TMP11]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[X2]], float [[TMP21]], i64 0 -; CHECK-NEXT: [[TMP23:%.*]] = fneg <4 x float> [[X0]] -; CHECK-NEXT: [[TMP24:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP8:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x float> [[TMP23]], i64 0 -; CHECK-NEXT: [[_MSPROP9:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP10:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x float> [[TMP24]], i64 0 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i32 [[_MSPROP8]], 0 -; CHECK-NEXT: [[_MSCMP22:%.*]] = icmp ne i32 [[_MSPROP9]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP22]] -; CHECK-NEXT: [[_MSCMP23:%.*]] = icmp ne i32 [[_MSPROP10]], 0 -; CHECK-NEXT: [[_MSOR24:%.*]] = or i1 [[_MSOR]], [[_MSCMP23]] -; CHECK-NEXT: br i1 [[_MSOR24]], label [[TMP28:%.*]], label [[TMP29:%.*]], !prof [[PROF1]] -; CHECK: 28: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 29: -; CHECK-NEXT: [[TMP30:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP25]], float [[TMP26]], float [[TMP27]], i32 11) -; CHECK-NEXT: [[_MSPROP11:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[_MSPROP12:%.*]] = insertelement <4 x i32> [[TMP2]], i32 0, i64 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x float> [[X2]], float [[TMP30]], i64 0 -; CHECK-NEXT: [[TMP33:%.*]] = fneg <4 x float> [[X0]] -; CHECK-NEXT: [[TMP34:%.*]] = fneg <4 x float> [[X2]] -; CHECK-NEXT: [[_MSPROP13:%.*]] = extractelement <4 x i32> [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x float> [[TMP33]], i64 0 -; CHECK-NEXT: [[_MSPROP14:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x float> [[X1]], i64 0 -; CHECK-NEXT: [[_MSPROP15:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <4 x float> [[TMP34]], i64 0 -; CHECK-NEXT: [[_MSCMP25:%.*]] = icmp ne i32 [[_MSPROP13]], 0 -; CHECK-NEXT: [[_MSCMP26:%.*]] = icmp ne i32 [[_MSPROP14]], 0 -; CHECK-NEXT: [[_MSOR27:%.*]] = or i1 [[_MSCMP25]], [[_MSCMP26]] -; CHECK-NEXT: [[_MSCMP28:%.*]] = icmp ne i32 [[_MSPROP15]], 0 -; CHECK-NEXT: [[_MSOR29:%.*]] = or i1 [[_MSOR27]], [[_MSCMP28]] -; CHECK-NEXT: br i1 [[_MSOR29]], label [[TMP38:%.*]], label [[TMP39:%.*]], !prof [[PROF1]] -; CHECK: 38: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 39: -; CHECK-NEXT: [[TMP40:%.*]] = call float @llvm.x86.avx512.vfmadd.f32(float [[TMP35]], float [[TMP36]], float [[TMP37]], i32 10) -; CHECK-NEXT: [[_MSPROP16:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <4 x float> [[X2]], i64 0 -; CHECK-NEXT: [[TMP42:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP43:%.*]] = bitcast i8 [[X3]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP17:%.*]] = extractelement <8 x i1> [[TMP42]], i64 0 -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i1> [[TMP43]], i64 0 -; CHECK-NEXT: [[TMP45:%.*]] = select i1 [[TMP44]], i32 0, i32 [[_MSPROP16]] -; CHECK-NEXT: [[TMP46:%.*]] = bitcast float [[TMP40]] to i32 -; CHECK-NEXT: [[TMP47:%.*]] = bitcast float [[TMP41]] to i32 -; CHECK-NEXT: [[TMP48:%.*]] = xor i32 [[TMP46]], [[TMP47]] -; CHECK-NEXT: [[TMP49:%.*]] = or i32 [[TMP48]], 0 -; CHECK-NEXT: [[TMP50:%.*]] = or i32 [[TMP49]], [[_MSPROP16]] -; CHECK-NEXT: [[_MSPROP_SELECT18:%.*]] = select i1 [[_MSPROP17]], i32 [[TMP50]], i32 [[TMP45]] -; CHECK-NEXT: [[TMP51:%.*]] = select i1 [[TMP44]], float [[TMP40]], float [[TMP41]] -; CHECK-NEXT: [[_MSPROP19:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT18]], i64 0 -; CHECK-NEXT: [[TMP52:%.*]] = insertelement <4 x float> [[X2]], float [[TMP51]], i64 0 -; CHECK-NEXT: [[_MSPROP20:%.*]] = or <4 x i32> [[_MSPROP7]], [[_MSPROP12]] -; 
CHECK-NEXT: [[RES3:%.*]] = fadd <4 x float> [[TMP22]], [[TMP32]] -; CHECK-NEXT: [[_MSPROP21:%.*]] = or <4 x i32> [[_MSPROP19]], [[_MSPROP20]] -; CHECK-NEXT: [[RES4:%.*]] = fadd <4 x float> [[TMP52]], [[RES3]] -; CHECK-NEXT: store <4 x i32> [[_MSPROP21]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[RES4]] -; - %1 = fneg <4 x float> %x0 - %2 = fneg <4 x float> %x2 - %3 = extractelement <4 x float> %1, i64 0 - %4 = extractelement <4 x float> %x1, i64 0 - %5 = extractelement <4 x float> %2, i64 0 - %6 = call float @llvm.fma.f32(float %3, float %4, float %5) - %7 = extractelement <4 x float> %x2, i64 0 - %8 = bitcast i8 %x3 to <8 x i1> - %9 = extractelement <8 x i1> %8, i64 0 - %10 = select i1 %9, float %6, float %7 - %11 = insertelement <4 x float> %x2, float %10, i64 0 - %12 = fneg <4 x float> %x0 - %13 = fneg <4 x float> %x2 - %14 = extractelement <4 x float> %12, i64 0 - %15 = extractelement <4 x float> %x1, i64 0 - %16 = extractelement <4 x float> %13, i64 0 - %17 = call float @llvm.x86.avx512.vfmadd.f32(float %14, float %15, float %16, i32 11) - %18 = extractelement <4 x float> %x2, i64 0 - %19 = insertelement <4 x float> %x2, float %17, i64 0 - %20 = fneg <4 x float> %x0 - %21 = fneg <4 x float> %x2 - %22 = extractelement <4 x float> %20, i64 0 - %23 = extractelement <4 x float> %x1, i64 0 - %24 = extractelement <4 x float> %21, i64 0 - %25 = call float @llvm.x86.avx512.vfmadd.f32(float %22, float %23, float %24, i32 10) - %26 = extractelement <4 x float> %x2, i64 0 - %27 = bitcast i8 %x3 to <8 x i1> - %28 = extractelement <8 x i1> %27, i64 0 - %29 = select i1 %28, float %25, float %26 - %30 = insertelement <4 x float> %x2, float %29, i64 0 - %res3 = fadd <4 x float> %11, %19 - %res4 = fadd <4 x float> %30, %res3 - ret <4 x float> %res4 -} - -define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, ptr%ptr_b ,i8 %x3,i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask3_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> 
[[VECINIT_I]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] -; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP3]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP12]] to i32 -; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] -; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP12]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X1]], float [[TMP23]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP24]] -; - %q = load float, ptr %ptr_b - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %vecinit.i, i64 0 - %3 = extractelement <4 x float> %x1, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %3 - %8 = insertelement <4 x float> %x1, float %7, i64 0 - ret <4 x float> %8 -} - -define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_mask_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP5:%.*]], label [[TMP6:%.*]], !prof [[PROF1]] -; CHECK: 5: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 6: -; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP8:%.*]] = xor i64 [[TMP7]], 87960930222080 -; CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP9]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = 
extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] -; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.fma.f32(float [[TMP10]], float [[TMP11]], float [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> -; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8 [[X3:%.*]] to <8 x i1> -; CHECK-NEXT: [[_MSPROP6:%.*]] = extractelement <8 x i1> [[TMP14]], i64 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP15]], i64 0 -; CHECK-NEXT: [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[_MSPROP5]], i32 [[_MSPROP1]] -; CHECK-NEXT: [[TMP18:%.*]] = bitcast float [[TMP13]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = bitcast float [[TMP10]] to i32 -; CHECK-NEXT: [[TMP20:%.*]] = xor i32 [[TMP18]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[TMP20]], [[_MSPROP5]] -; CHECK-NEXT: [[TMP22:%.*]] = or i32 [[TMP21]], [[_MSPROP1]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 [[_MSPROP6]], i32 [[TMP22]], i32 [[TMP17]] -; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP16]], float [[TMP13]], float [[TMP10]] -; CHECK-NEXT: [[_MSPROP7:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[X0]], float [[TMP23]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP7]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP24]] -; - %q = load float, ptr %ptr_b - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %vecinit.i, i64 0 - %3 = extractelement <4 x float> %x1, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = extractelement <8 x i1> %5, i64 0 - %7 = select i1 %6, float %4, float %1 - %8 = insertelement <4 x float> %x0, float %7, i64 0 - ret <4 x float> %8 -} - - -define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1,ptr%ptr_b ,i8 %x3,i32 %x4) #0 { -; CHECK-LABEL: @test_int_x86_avx512_maskz_vfmadd_ss_rm( -; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP5:%.*]], !prof [[PROF1]] -; CHECK: 4: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 5: -; CHECK-NEXT: [[Q:%.*]] = load float, ptr [[PTR_B:%.*]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PTR_B]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 -; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load i32, ptr [[TMP8]], align 4 -; CHECK-NEXT: [[_MSPROP:%.*]] = insertelement <4 x i32> splat (i32 -1), i32 [[_MSLD]], 
i32 0 -; CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <4 x float> undef, float [[Q]], i32 0 -; CHECK-NEXT: [[_MSPROP1:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[X0:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP2:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[X1:%.*]], i64 0 -; CHECK-NEXT: [[_MSPROP3:%.*]] = extractelement <4 x i32> [[_MSPROP]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[VECINIT_I]], i64 0 -; CHECK-NEXT: [[_MSPROP4:%.*]] = or i32 [[_MSPROP1]], [[_MSPROP2]] -; CHECK-NEXT: [[_MSPROP5:%.*]] = or i32 [[_MSPROP4]], [[_MSPROP3]] -; CHECK-NEXT: [[TMP12:%.*]] = call float @llvm.fma.f32(float [[TMP9]], float [[TMP10]], float [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = select i1 false, i32 [[_MSPROP5]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = bitcast float [[TMP12]] to i32 -; CHECK-NEXT: [[TMP15:%.*]] = xor i32 [[TMP14]], 0 -; CHECK-NEXT: [[TMP16:%.*]] = or i32 [[TMP15]], [[_MSPROP5]] -; CHECK-NEXT: [[TMP17:%.*]] = or i32 [[TMP16]], 0 -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select i1 false, i32 [[TMP17]], i32 [[TMP13]] -; CHECK-NEXT: [[TMP18:%.*]] = select i1 false, float [[TMP12]], float 0.000000e+00 -; CHECK-NEXT: [[_MSPROP6:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[_MSPROP_SELECT]], i64 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[X0]], float [[TMP18]], i64 0 -; CHECK-NEXT: store <4 x i32> [[_MSPROP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <4 x float> [[TMP19]] -; - %q = load float, ptr %ptr_b - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 - %1 = extractelement <4 x float> %x0, i64 0 - %2 = extractelement <4 x float> %x1, i64 0 - %3 = extractelement <4 x float> %vecinit.i, i64 0 - %4 = call float @llvm.fma.f32(float %1, float %2, float %3) - %5 = select i1 false, float %4, float 0.000000e+00 - %6 = insertelement <4 x float> %x0, float %5, i64 0 - ret <4 x float> %6 -} - -define <16 x i32> @test_x86_avx512_psll_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psll_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 
ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psll_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psll_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] -; 
CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psll_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psll_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psll_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> 
[[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psll_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psll_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone - - -define <16 x i32> @test_x86_avx512_pslli_d_512(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], 
ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_pslli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_pslli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_pslli_d_512(<16 x i32> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> 
@llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone - - -define <8 x i64> @test_x86_avx512_pslli_q_512(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_pslli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_pslli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_pslli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_pslli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> 
[[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psra_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psra_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x 
i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psra_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psra_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone - - -define <16 x i32> @test_x86_avx512_psra_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psra_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 
8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psra_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psra_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psra_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = 
load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone - - - -define <8 x i64> @test_x86_avx512_psrai_q_512(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psrai_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrai_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> 
[[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psrai_q_512(<8 x i64> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrai_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone - - -define <16 x i32> @test_x86_avx512_psrai_d_512(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrai_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> 
@llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psrai_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrai_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psrai_d_512(<16 x i32> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrai_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 
%mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone - - - -define <16 x i32> @test_x86_avx512_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP9]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrl_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP11]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP16]], <16 x i32> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x 
i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psrl_d_512(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[TMP1]], <4 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> [[A0:%.*]], <4 x i32> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP10]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i32> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> [[TMP15]], <16 x i32> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrl_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP4:%.*]] = trunc i128 [[TMP3]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[TMP4]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = sext i1 [[TMP5]] to i512 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i512 [[TMP6]] to <8 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP9]], ptr 
@__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrl_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 144) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = icmp ne i64 [[TMP6]], 0 -; CHECK-NEXT: [[TMP8:%.*]] = sext i1 [[TMP7]] to i512 -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i512 [[TMP8]] to <8 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP9]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP11]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i64> [[TMP15]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i64> [[TMP16]], <8 x i64> [[TMP13]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psrl_q_512(<8 x i64> %a0, <2 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrl_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 80) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i64> [[TMP2]] to i128 -; CHECK-NEXT: [[TMP5:%.*]] = trunc i128 [[TMP4]] to i64 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[TMP5]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = sext i1 [[TMP6]] to i512 -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i512 [[TMP7]] to <8 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[TMP1]], <2 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP8]] -; CHECK-NEXT: [[RES:%.*]] = call <8 
x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> [[A0:%.*]], <2 x i64> [[A1]]) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP10]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i64> [[TMP13]], [[TMP10]] -; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i64> [[TMP14]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i64> [[TMP15]], <8 x i64> [[TMP12]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone - - -define <16 x i32> @test_x86_avx512_psrli_d_512(<16 x i32> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - ret <16 x i32> %res -} -define <16 x i32> @test_x86_avx512_mask_psrli_d_512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <16 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP5]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <16 x i32> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i32> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i32> [[TMP10]], <16 x i32> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[PASSTHRU]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x 
i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %passthru - ret <16 x i32> %res2 -} -define <16 x i32> @test_x86_avx512_maskz_psrli_d_512(<16 x i32> %a0, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP4]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 7) ; <<16 x i32>> [#uses=1] - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} -declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone - - -define <8 x i64> @test_x86_avx512_psrli_q_512(<8 x i64> %a0) #0 { -; CHECK-LABEL: @test_x86_avx512_psrli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - ret <8 x i64> %res -} -define <8 x i64> @test_x86_avx512_mask_psrli_q_512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = or <8 x i64> [[TMP4]], zeroinitializer -; CHECK-NEXT: 
[[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP5]], <8 x i64> [[TMP3]] -; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i64> [[RES]], [[PASSTHRU:%.*]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i64> [[TMP9]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i64> [[TMP10]], <8 x i64> [[TMP7]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[PASSTHRU]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %passthru - ret <8 x i64> %res2 -} -define <8 x i64> @test_x86_avx512_maskz_psrli_q_512(<8 x i64> %a0, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrli_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[TMP1]], i32 7) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> [[A0:%.*]], i32 7) -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP4]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = or <8 x i64> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP5]], <8 x i64> [[TMP9]], <8 x i64> [[TMP6]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 7) ; <<8 x i64>> [#uses=1] - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} -declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone - -define <16 x i32> @test_x86_avx512_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> 
[[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_psllv_d_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_d_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res0 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) - %res1 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> , <16 x i32> ) - %res2 = add <16 x i32> %res0, %res1 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_mask_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psllv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> 
%a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_maskz_psllv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psllv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} - -declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone - -define <8 x i64> @test_x86_avx512_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_psllv_q_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psllv_q_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> 
@llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) - %res1 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> , <8 x i64> ) - %res2 = add <8 x i64> %res0, %res1 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_mask_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psllv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_maskz_psllv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psllv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to 
i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} - -declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrav_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_mask_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrav_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call 
<16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_maskz_psrav_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrav_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} - -declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrav_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load 
<8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_mask_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrav_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_maskz_psrav_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrav_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = 
icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} - -declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone - -define <16 x i32> @test_x86_avx512_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i1> [[TMP3]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <16 x i32> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: store <16 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) - ret <16 x i32> %res -} - -define <16 x i32> @test_x86_avx512_psrlv_d_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_d_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <16 x i32> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> zeroinitializer, <16 x i32> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <16 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <16 x i32> [[RES0]], [[RES1]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res0 
= call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) - %res1 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> , <16 x i32> ) - %res2 = add <16 x i32> %res0, %res1 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_mask_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrlv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[TMP8]], <16 x i32> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <16 x i32> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i32> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i32> [[TMP13]], <16 x i32> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> [[A2]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %a2 - ret <16 x i32> %res2 -} - -define <16 x i32> @test_x86_avx512_maskz_psrlv_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_d_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i1> [[TMP4]] to <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[TMP1]], <16 x i32> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <16 x i32> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> [[A0:%.*]], <16 x i32> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <16 x i1> 
[[MASK_CAST]], <16 x i32> [[TMP7]], <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i32> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i32> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i32> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i32> [[TMP12]], <16 x i32> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <16 x i1> [[MASK_CAST]], <16 x i32> [[RES]], <16 x i32> zeroinitializer -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x i32> [[RES2]] -; - %res = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1) - %mask.cast = bitcast i16 %mask to <16 x i1> - %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> zeroinitializer - ret <16 x i32> %res2 -} - -declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone - -define <8 x i64> @test_x86_avx512_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1) #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i1> [[TMP3]] to <8 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP6:%.*]] = or <8 x i64> [[TMP5]], [[TMP4]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: store <8 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) - ret <8 x i64> %res -} - -define <8 x i64> @test_x86_avx512_psrlv_q_512_const() #0 { -; CHECK-LABEL: @test_x86_avx512_psrlv_q_512_const( -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[RES0:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> zeroinitializer, <8 x i64> ) -; CHECK-NEXT: [[TMP4:%.*]] = or <8 x i64> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[RES1:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i64> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[RES2:%.*]] = add <8 x i64> [[RES0]], [[RES1]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res0 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) - %res1 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> , <8 x i64> ) - %res2 = add <8 x i64> %res0, %res1 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_mask_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_mask_psrlv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; 
CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP8:%.*]] = or <8 x i64> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP8]], <8 x i64> [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i64> [[RES]], [[A2:%.*]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i64> [[TMP12]], [[TMP4]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i64> [[TMP13]], <8 x i64> [[TMP10]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> [[A2]] -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> %a2 - ret <8 x i64> %res2 -} - -define <8 x i64> @test_x86_avx512_maskz_psrlv_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) #0 { -; CHECK-LABEL: @test_x86_avx512_maskz_psrlv_q_512( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i64> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i1> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[TMP1]], <8 x i64> [[A1:%.*]]) -; CHECK-NEXT: [[TMP7:%.*]] = or <8 x i64> [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> [[A0:%.*]], <8 x i64> [[A1]]) -; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> -; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i8 [[MASK:%.*]] to <8 x i1> -; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[TMP7]], <8 x i64> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i64> [[RES]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i64> [[TMP10]], [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i64> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i64> [[TMP12]], <8 x i64> [[TMP9]] -; CHECK-NEXT: [[RES2:%.*]] = select <8 x i1> [[MASK_CAST]], <8 x i64> [[RES]], <8 x i64> zeroinitializer -; CHECK-NEXT: store <8 x i64> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES2]] -; - %res = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> 
%a1) - %mask.cast = bitcast i8 %mask to <8 x i1> - %res2 = select <8 x i1> %mask.cast, <8 x i64> %res, <8 x i64> zeroinitializer - ret <8 x i64> %res2 -} - -declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone - - -define <8 x double> @test_mm256_castpd128_pd256_freeze(<2 x double> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castpd128_pd256_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <2 x double> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x double> [[A0:%.*]], <2 x double> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %a1 = freeze <2 x double> poison - %res = shufflevector <2 x double> %a0, <2 x double> %a1, <8 x i32> - ret <8 x double> %res -} - - -define <8 x double> @test_mm256_castpd256_pd256_freeze(<4 x double> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castpd256_pd256_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <4 x double> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x double> [[A0:%.*]], <4 x double> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x double> [[RES]] -; - %a1 = freeze <4 x double> poison - %res = shufflevector <4 x double> %a0, <4 x double> %a1, <8 x i32> - ret <8 x double> %res -} - - -define <16 x float> @test_mm256_castps128_ps512_freeze(<4 x float> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castps128_ps512_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <4 x float> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <16 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[A0:%.*]], <4 x float> [[A1]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %a1 = freeze <4 x float> poison - %res = shufflevector <4 x float> %a0, <4 x float> %a1, <16x i32> - ret <16 x float> %res -} - - -define <16 x float> @test_mm256_castps256_ps512_freeze(<8 x float> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm256_castps256_ps512_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <8 x float> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> zeroinitializer, <16 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <8 x float> [[A0:%.*]], <8 x float> [[A1]], <16 x i32> -; CHECK-NEXT: store <16 x i32> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[RES]] -; - %a1 = freeze <8 x float> poison - %res = shufflevector <8 x float> %a0, <8 x float> %a1, <16x i32> - ret <16 x float> %res -} - - -define <8 x i64> @test_mm512_castsi128_si512_freeze(<2 x i64> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm512_castsi128_si512_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: 
call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <2 x i64> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <2 x i64> [[A0:%.*]], <2 x i64> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %a1 = freeze <2 x i64> poison - %res = shufflevector <2 x i64> %a0, <2 x i64> %a1, <8 x i32> - ret <8 x i64> %res -} - - -define <8 x i64> @test_mm512_castsi256_si512_pd256_freeze(<4 x i64> %a0) nounwind #0 { -; CHECK-LABEL: @test_mm512_castsi256_si512_pd256_freeze( -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[A1:%.*]] = freeze <4 x i64> poison -; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> zeroinitializer, <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i64> [[A0:%.*]], <4 x i64> [[A1]], <8 x i32> -; CHECK-NEXT: store <8 x i64> [[_MSPROP]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <8 x i64> [[RES]] -; - %a1 = freeze <4 x i64> poison - %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> - ret <8 x i64> %res -} - - -define <16 x float> @bad_mask_transition(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { -; CHECK-LABEL: @bad_mask_transition( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP6]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP7]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP8:%.*]], label [[TMP9:%.*]], !prof [[PROF1]] -; CHECK: 8: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 9: -; CHECK-NEXT: [[TMP10:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i1> [[TMP10]] to i8 -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x i64> [[TMP2]] to i512 -; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP12]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i64> [[TMP3]] to i512 -; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP13]], 0 -; CHECK-NEXT: [[_MSOR4:%.*]] = or i1 [[_MSCMP2]], [[_MSCMP3]] -; CHECK-NEXT: br i1 [[_MSOR4]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]] -; CHECK: 14: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; 
CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[C:%.*]], <8 x double> [[D:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i1> [[TMP16]] to i8 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP11]] to i16 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP17]] to i16 -; CHECK-NEXT: [[TMP18:%.*]] = bitcast i16 [[CONV]] to <16 x i1> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast i16 [[CONV2]] to <16 x i1> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP18]], <16 x i1> undef, <8 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i1> [[TMP19]], <16 x i1> undef, <8 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP22]], <16 x i32> [[TMP4]], <16 x i32> [[TMP5]] -; CHECK-NEXT: [[TMP24:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = xor <16 x i32> [[TMP24]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = or <16 x i32> [[TMP26]], [[TMP4]] -; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i32> [[TMP27]], [[TMP5]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP28]], <16 x i32> [[TMP23]] -; CHECK-NEXT: [[TMP29:%.*]] = select <16 x i1> [[TMP22]], <16 x float> [[F]], <16 x float> [[E]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP29]] -; -entry: - %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> , i32 4) - %1 = bitcast <8 x i1> %0 to i8 - %2 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %c, <8 x double> %d, i32 17, <8 x i1> , i32 4) - %3 = bitcast <8 x i1> %2 to i8 - %conv = zext i8 %1 to i16 - %conv2 = zext i8 %3 to i16 - %4 = bitcast i16 %conv to <16 x i1> - %5 = bitcast i16 %conv2 to <16 x i1> - %6 = shufflevector <16 x i1> %4, <16 x i1> undef, <8 x i32> - %7 = shufflevector <16 x i1> %5, <16 x i1> undef, <8 x i32> - %8 = shufflevector <8 x i1> %6, <8 x i1> %7, <16 x i32> - %9 = select <16 x i1> %8, <16 x float> %f, <16 x float> %e - ret <16 x float> %9 -} - -define <16 x float> @bad_mask_transition_2(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double> %d, <16 x float> %e, <16 x float> %f) #0 { -; CHECK-LABEL: @bad_mask_transition_2( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 320) to ptr), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 256) to ptr), align 8 -; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i64> [[TMP0]] to i512 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i64> [[TMP1]] to i512 -; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 -; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] -; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: 
[[TMP8:%.*]] = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> [[A:%.*]], <8 x double> [[B:%.*]], i32 17, <8 x i1> splat (i1 true), i32 4) -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i1> [[TMP8]] to i8 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP9]] to i16 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[CONV]] to <16 x i1> -; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP2]], <16 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x float> [[F:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[E:%.*]] to <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP12]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], [[TMP2]] -; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> zeroinitializer, <16 x i32> [[TMP16]], <16 x i32> [[TMP11]] -; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x float> [[F]], <16 x float> [[E]] -; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret <16 x float> [[TMP17]] -; -entry: - %0 = call <8 x i1> @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 17, <8 x i1> , i32 4) - %1 = bitcast <8 x i1> %0 to i8 - %conv = zext i8 %1 to i16 - %2 = bitcast i16 %conv to <16 x i1> - %3 = select <16 x i1> %2, <16 x float> %f, <16 x float> %e - ret <16 x float> %3 -} - -declare <8 x double> @llvm.x86.avx512.mask.compress.v8f64(<8 x double>, <8 x double>, <8 x i1>) -declare <16 x float> @llvm.x86.avx512.mask.compress.v16f32(<16 x float>, <16 x float>, <16 x i1>) -declare <8 x i64> @llvm.x86.avx512.mask.compress.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) -declare <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) -declare <8 x double> @llvm.x86.avx512.mask.expand.v8f64(<8 x double>, <8 x double>, <8 x i1>) -declare <16 x float> @llvm.x86.avx512.mask.expand.v16f32(<16 x float>, <16 x float>, <16 x i1>) -declare <8 x i64> @llvm.x86.avx512.mask.expand.v8i64(<8 x i64>, <8 x i64>, <8 x i1>) -declare <16 x i32> @llvm.x86.avx512.mask.expand.v16i32(<16 x i32>, <16 x i32>, <16 x i1>) - -attributes #0 = { sanitize_memory } From 7107f55d82f8d1077d5478e8f58c94851385c06f Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Mon, 27 Jan 2025 00:25:19 -0300 Subject: [PATCH 158/432] [clang] NFC: remove redundant dyn_cast --- clang/lib/AST/TemplateBase.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 3625b6e435a55..0eef8f305fcb3 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -515,19 +515,17 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out, } case Declaration: { - NamedDecl *ND = getAsDecl(); + ValueDecl *VD = getAsDecl(); if (getParamTypeForDecl()->isRecordType()) { - if (auto *TPO = dyn_cast(ND)) { + if (auto *TPO = dyn_cast(VD)) { TPO->getType().getUnqualifiedType().print(Out, Policy); TPO->printAsInit(Out, Policy); break; } } - if (auto *VD = dyn_cast(ND)) { - if (needsAmpersandOnTemplateArg(getParamTypeForDecl(), VD->getType())) - Out << "&"; - } - ND->printQualifiedName(Out); + if (needsAmpersandOnTemplateArg(getParamTypeForDecl(), VD->getType())) + Out << "&"; + VD->printQualifiedName(Out); break; } From 0e6b58202ca9c4d1ca814e4bea5bd3f0bac7f329 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 26 Jan 2025 20:20:51 -0800 Subject: [PATCH 159/432] [ELF] Improve 
 parseSymbolVersion tests for compileBitcodeFiles

Otherwise, the tests won't catch a mistake that removes
`parseSymbolVersion`.
---
 lld/test/ELF/lto/version-script.ll  | 34 ++++------------------------
 lld/test/ELF/lto/version-script2.ll |  8 +++----
 2 files changed, 8 insertions(+), 34 deletions(-)

diff --git a/lld/test/ELF/lto/version-script.ll b/lld/test/ELF/lto/version-script.ll
index 52b9afc38eeed..54a5e01f2dee5 100644
--- a/lld/test/ELF/lto/version-script.ll
+++ b/lld/test/ELF/lto/version-script.ll
@@ -3,7 +3,7 @@
 ; RUN: echo "VERSION_1.0{ global: foo; local: *; }; VERSION_2.0{ global: bar; local: *; };" > %t.script
 ; RUN: ld.lld %t.o -o %t2 -shared --version-script %t.script -save-temps
 ; RUN: llvm-dis < %t2.0.0.preopt.bc | FileCheck %s
-; RUN: llvm-readobj --dyn-syms %t2 | FileCheck --check-prefix=DSO %s
+; RUN: llvm-readelf --dyn-syms %t2 | FileCheck --check-prefix=DSO %s
 
 target triple = "x86_64-unknown-linux-gnu"
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
@@ -19,32 +19,6 @@ define void @bar() {
 ; CHECK: define void @foo()
 ; CHECK: define void @bar()
 
-; DSO: DynamicSymbols [
-; DSO: Symbol {
-; DSO: Name:
-; DSO: Value: 0x0
-; DSO: Size: 0
-; DSO: Binding: Local
-; DSO: Type: None
-; DSO: Other: 0
-; DSO: Section: Undefined
-; DSO: }
-; DSO: Symbol {
-; DSO: Name: foo@@VERSION_1.0
-; DSO: Value:
-; DSO: Size: 1
-; DSO: Binding: Global
-; DSO: Type: Function
-; DSO: Other: 0
-; DSO: Section: .text
-; DSO: }
-; DSO: Symbol {
-; DSO: Name: bar@@VERSION_2.0
-; DSO: Value:
-; DSO: Size: 1
-; DSO: Binding: Global
-; DSO: Type: Function
-; DSO: Other: 0
-; DSO: Section: .text
-; DSO: }
-; DSO: ]
+; DSO: Symbol table '.dynsym' contains 3 entries:
+; DSO: 1: {{.*}} 1 FUNC GLOBAL DEFAULT [[#]] foo@@VERSION_1.0{{$}}
+; DSO: 2: {{.*}} 1 FUNC GLOBAL DEFAULT [[#]] bar@@VERSION_2.0{{$}}
diff --git a/lld/test/ELF/lto/version-script2.ll b/lld/test/ELF/lto/version-script2.ll
index dab22750f77b8..5635731518fdb 100644
--- a/lld/test/ELF/lto/version-script2.ll
+++ b/lld/test/ELF/lto/version-script2.ll
@@ -17,16 +17,16 @@
 ; RUN: ld.lld %t.o %tbar.so -o %t.so -shared --version-script %t/ver
 ; RUN: llvm-readelf --dyn-syms %t.so | FileCheck %s
 
-; CHECK: UND bar@VER1
-; CHECK-NEXT: {{[1-9]}} foo@@VER1
+; CHECK: UND bar@VER1{{$}}
+; CHECK-NEXT: {{[1-9]}} foo@@VER1{{$}}
 
 ;; For relocatable output, @ should be retained in the symbol name.
 ;; Don't parse and drop `@VER1`. Also check that --version-script is ignored.
 ; RUN: ld.lld %t.o -o %t.ro -r --version-script %t/ver
 ; RUN: llvm-readelf -s %t.ro | FileCheck %s --check-prefix=RELOCATABLE
 
-; RELOCATABLE: {{[1-9]}} foo@@VER1
-; RELOCATABLE-NEXT: UND bar@VER1
+; RELOCATABLE: {{[1-9]}} foo@@VER1{{$}}
+; RELOCATABLE-NEXT: UND bar@VER1{{$}}
 
 ;--- ver
 VER1 {};

From 2a26292388fcab0c857c91b2d08074c33abd37e8 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 26 Jan 2025 20:32:42 -0800
Subject: [PATCH 160/432] [ELF] Make isExported accurate early

LTO compilation might define symbols not in the symbol table (e.g.
__emutls_v.x in test/ELF/lto/wrap-unreferenced-before-codegen.test).
These symbols have a false `isExported` until
`demoteSymbolsAndComputeIsPreemptible`. This is usually benign as we do
not reference `isExported` that early.

Ensure that `isExported` is correct early. This helps remove a
redundant `isExported` computation in
`demoteSymbolsAndComputeIsPreemptible`.
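In outline, the fix moves the classification to the point where each
post-LTO object file is parsed (a condensed sketch using the names from
the diff below; the `!ctx.arg.relocatable` guard around this loop is
elided):

    for (Symbol *sym : obj->getGlobalSymbols()) {
      if (!sym->isDefined())
        continue;
      // isExported is now accurate as soon as LTO codegen finishes,
      // instead of only after demoteSymbolsAndComputeIsPreemptible.
      if (sym->includeInDynsym(ctx))
        sym->isExported = true;
      if (sym->hasVersionSuffix)
        sym->parseSymbolVersion(ctx);
    }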
---
 lld/ELF/Driver.cpp | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7e0d3fca31353..06c93710497ed 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -2157,9 +2157,12 @@ static void excludeLibs(Ctx &ctx, opt::InputArgList &args) {
     ArrayRef<Symbol *> symbols = file->getSymbols();
     if (isa<BitcodeFile>(file))
       symbols = cast<BitcodeFile>(file)->getGlobalSymbols();
-    for (Symbol *sym : symbols)
-      if (!sym->isUndefined() && sym->file == file)
+    for (Symbol *sym : symbols) {
+      if (!sym->isUndefined() && sym->file == file) {
         sym->versionId = VER_NDX_LOCAL;
+        sym->isExported = false;
+      }
+    }
   };
 
   for (ELFFileBase *file : ctx.objectFiles)
@@ -2545,11 +2548,17 @@ void LinkerDriver::compileBitcodeFiles(bool skipLinkedOutput) {
     auto *obj = cast<ObjFile<ELFT>>(file.get());
     obj->parse(/*ignoreComdats=*/true);
 
-    // Parse '@' in symbol names for non-relocatable output.
+    // For defined symbols in non-relocatable output,
+    // compute isExported and parse '@'.
     if (!ctx.arg.relocatable)
-      for (Symbol *sym : obj->getGlobalSymbols())
+      for (Symbol *sym : obj->getGlobalSymbols()) {
+        if (!sym->isDefined())
+          continue;
+        if (sym->includeInDynsym(ctx))
+          sym->isExported = true;
         if (sym->hasVersionSuffix)
           sym->parseSymbolVersion(ctx);
+      }
     ctx.objectFiles.push_back(obj);
   }
 }
@@ -3061,7 +3070,7 @@ template <class ELFT> void LinkerDriver::link(opt::InputArgList &args) {
 
   // Handle --exclude-libs again because lto.tmp may reference additional
   // libcalls symbols defined in an excluded archive. This may override
-  // versionId set by scanVersionScript().
+  // versionId set by scanVersionScript() and isExported.
   if (args.hasArg(OPT_exclude_libs))
     excludeLibs(ctx, args);
 
From 1a4d6de1b532149b10522eae5dabce39e5f7c687 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 26 Jan 2025 20:43:03 -0800
Subject: [PATCH 161/432] [ELF] Remove redundant isExported computation

Commit 2a26292388fcab0c857c91b2d08074c33abd37e8 made `isExported`
accurate except a few linker-synthesized symbols in finalizeSections.
We can collect these linker-synthesized symbols into a vector and avoid
recomputation for other symbols.
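The resulting flow, condensed (names as in the diffs below; simplified):

    // addOptionalRegular() records each linker-synthesized symbol:
    ctx.synthesizedSymbols.push_back(s);

    // finalizeSections() recomputes isExported only for that small set;
    // every other symbol keeps the value computed early:
    for (Symbol *sym : ctx.synthesizedSymbols)
      sym->isExported = sym->includeInDynsym(ctx);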
---
 lld/ELF/Config.h   |  1 +
 lld/ELF/Writer.cpp | 12 ++++++++++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index b2859486d58e9..b1d7fb88553e2 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -619,6 +619,7 @@ struct Ctx : CommonLinkerContext {
   };
   ElfSym sym{};
   std::unique_ptr<SymbolTable> symtab;
+  SmallVector<Symbol *, 0> synthesizedSymbols;
 
   SmallVector<std::unique_ptr<MemoryBuffer>> memoryBuffers;
   SmallVector<ELFFileBase *, 0> objectFiles;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index b7c4790655e8a..4b75137a5db21 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -149,6 +149,7 @@ static Defined *addOptionalRegular(Ctx &ctx, StringRef name, SectionBase *sec,
   if (!s || s->isDefined() || s->isCommon())
     return nullptr;
 
+  ctx.synthesizedSymbols.push_back(s);
   s->resolve(ctx, Defined{ctx, ctx.internalFile, StringRef(), STB_GLOBAL,
                           stOther, STT_NOTYPE, val,
                           /*size=*/0, sec});
@@ -282,6 +283,7 @@ static void demoteDefined(Defined &sym, DenseMap<SectionBase *, size_t> &map) {
 static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
   llvm::TimeTraceScope timeScope("Demote symbols");
   DenseMap<InputFile *, DenseMap<SectionBase *, size_t>> sectionIndexMap;
+  bool hasDynSymTab = ctx.arg.hasDynSymTab;
   for (Symbol *sym : ctx.symtab->getSymbols()) {
     if (auto *d = dyn_cast<Defined>(sym)) {
       if (d->section && !d->section->isLive())
@@ -294,11 +296,12 @@ static void demoteSymbolsAndComputeIsPreemptible(Ctx &ctx) {
                sym->type)
           .overwrite(*sym);
       sym->versionId = VER_NDX_GLOBAL;
+      if (sym->includeInDynsym(ctx))
+        sym->isExported = true;
     }
   }
 
-    sym->isExported = sym->includeInDynsym(ctx);
-    if (ctx.arg.hasDynSymTab)
+    if (hasDynSymTab)
       sym->isPreemptible = sym->isExported && computeIsPreemptible(ctx, *sym);
   }
 }
@@ -1846,6 +1849,11 @@ template <class ELFT> void Writer<ELFT>::finalizeSections() {
     }
   }
 
+  // If the previous code block defines any non-hidden symbols (e.g.
+  // __global_pointer$), they may be exported.
+  for (Symbol *sym : ctx.synthesizedSymbols)
+    sym->isExported = sym->includeInDynsym(ctx);
+
   demoteSymbolsAndComputeIsPreemptible(ctx);
 
   if (ctx.arg.copyRelocs && ctx.arg.discard != DiscardPolicy::None)

From b9efbed468ec18044070eea936c694fb8f6e244b Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 26 Jan 2025 21:14:49 -0800
Subject: [PATCH 162/432] Revert "Move HIP fatbin sections farther away from
 .text"

This reverts commit 048f35037779763963c4b4478a0884e828ea9538.
This reverts commit f7bbc40b0736cc417f57cd039b098b504cf6a71f.

Related to #95949. A developer with no prior lld contribution and very
little AMD contribution sneaked in these application-specific section
order rules we discourage.
---
 lld/ELF/Writer.cpp                | 10 --------
 lld/test/ELF/hip-section-layout.s | 41 -------------------------------
 2 files changed, 51 deletions(-)
 delete mode 100644 lld/test/ELF/hip-section-layout.s

diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index 4b75137a5db21..6b34f87f0b8d0 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -650,7 +650,6 @@ static bool isRelroSection(Ctx &ctx, const OutputSection *sec) {
 enum RankFlags {
   RF_NOT_ADDR_SET = 1 << 27,
   RF_NOT_ALLOC = 1 << 26,
-  RF_HIP_FATBIN = 1 << 19,
   RF_PARTITION = 1 << 18, // Partition number (8 bits)
   RF_LARGE_ALT = 1 << 15,
   RF_WRITE = 1 << 14,
@@ -748,15 +747,6 @@ unsigned elf::getSectionRank(Ctx &ctx, OutputSection &osec) {
   if (osec.type == SHT_NOBITS)
     rank |= RF_BSS;
 
-  // Put HIP fatbin related sections further away to avoid wasting relocation
-  // range to jump over them. Make sure .hip_fatbin is the furthest.
- if (osec.name == ".hipFatBinSegment") - rank |= RF_HIP_FATBIN; - if (osec.name == ".hip_gpubin_handle") - rank |= RF_HIP_FATBIN | 2; - if (osec.name == ".hip_fatbin") - rank |= RF_HIP_FATBIN | RF_WRITE | 3; - // Some architectures have additional ordering restrictions for sections // within the same PT_LOAD. if (ctx.arg.emachine == EM_PPC64) { diff --git a/lld/test/ELF/hip-section-layout.s b/lld/test/ELF/hip-section-layout.s deleted file mode 100644 index b76141c6b41ae..0000000000000 --- a/lld/test/ELF/hip-section-layout.s +++ /dev/null @@ -1,41 +0,0 @@ -# REQUIRES: x86 -## Test HIP specific sections layout. - -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=HIP_SECTIONS=1 --defsym=NON_HIP_SECTIONS=1 %s -o %t.o -# RUN: ld.lld %t.o -o %t.out -# RUN: llvm-readobj --sections %t.out | FileCheck %s - -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=NON_HIP_SECTIONS=1 %s -o %t.1.o -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux --defsym=HIP_SECTIONS=1 %s -o %t.2.o -# RUN: ld.lld %t.1.o %t.2.o -o %t.1.s.out -# RUN: llvm-readobj --sections %t.1.s.out | FileCheck %s -# RUN: ld.lld %t.2.o %t.1.o -o %t.2.s.out -# RUN: llvm-readobj --sections %t.2.s.out | FileCheck %s - -.ifdef HIP_SECTIONS -.section .hipFatBinSegment,"aw",@progbits; .space 1 -.section .hip_gpubin_handle,"aw",@progbits; .space 1 -.section .hip_fatbin,"a",@progbits; .space 1 -.endif - -.ifdef NON_HIP_SECTIONS -.global _start -.text -_start: -.section .bss,"aw",@nobits; .space 1 -.section .debug_info,"",@progbits -.section .debug_line,"",@progbits -.section .debug_str,"MS",@progbits,1 -.endif - -# Check that the HIP sections are placed towards the end but before non allocated sections - -// CHECK: Name: .text -// CHECK: Name: .bss -// CHECK: Name: .hipFatBinSegment -// CHECK: Name: .hip_gpubin_handle -// CHECK: Name: .hip_fatbin -// CHECK: Name: .debug_info -// CHECK: Name: .debug_line -// CHECK: Name: .debug_str - From 6805d7e8aa5f2ecea021acbb8c6b4c29ca432e78 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 26 Jan 2025 22:28:31 -0800 Subject: [PATCH 163/432] [test] Convert remove-note.test from \r\n to \n after #118739 --- .../tools/llvm-objcopy/ELF/remove-note.test | 396 +++++++++--------- 1 file changed, 198 insertions(+), 198 deletions(-) diff --git a/llvm/test/tools/llvm-objcopy/ELF/remove-note.test b/llvm/test/tools/llvm-objcopy/ELF/remove-note.test index f8936bf9ea731..e15f934dfe2da 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/remove-note.test +++ b/llvm/test/tools/llvm-objcopy/ELF/remove-note.test @@ -1,198 +1,198 @@ -## Check incompatible options. -# RUN: not llvm-objcopy --remove-note=1 --remove-section=.test - 2>&1 | FileCheck %s --check-prefix=ERR-REMSEC -# RUN: not llvm-objcopy --remove-note=1 --add-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-ADDSEC -# RUN: not llvm-objcopy --remove-note=1 --update-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-UPDSEC - -# ERR-REMSEC: error: cannot specify both --remove-note and --remove-section -# ERR-ADDSEC: error: cannot specify both --remove-note and --add-section -# ERR-UPDSEC: error: cannot specify both --remove-note and --update-section - -## Check invalid argument formats. 
-# RUN: not llvm-objcopy --remove-note= - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID -# RUN: not llvm-objcopy --remove-note=CORE/ - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID -# RUN: not llvm-objcopy --remove-note=/1 - 2>&1 | FileCheck %s --check-prefix=ERR-EMPTYNAME -# RUN: not llvm-objcopy --remove-note=CORE/1/2 - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM1 -# RUN: not llvm-objcopy --remove-note=Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 -# RUN: not llvm-objcopy --remove-note=CORE/Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 - -# ERR-NOTYPEID: error: bad format for --remove-note, missing type_id -# ERR-EMPTYNAME: error: bad format for --remove-note, note name is empty -# ERR-INVNUM1: error: bad note type_id for --remove-note: '1/2' -# ERR-INVNUM2: error: bad note type_id for --remove-note: 'Notanumber' - -## Check deleting notes: -## * --remove-note=1 will remove note "CORE/1" and "LINUX/1", -## * --remove-note=DUMMY/2 will not remove any notes because there are no notes with this owner, -## * --remove-note=CORE/3 will remove "CORE/3" but preserve "LINUX/3". -# RUN: yaml2obj --docnum=1 -D ALIGN=8 -D ELFCLASS=64 -D ENDIANNESS=LSB %s -o %t8.64.lsb -# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/2 --remove-note=CORE/0x03 %t8.64.lsb %t8.64.lsb.o -# RUN: llvm-readobj --segments --sections --notes %t8.64.lsb.o | \ -# RUN: FileCheck %s -D#SIZE0=32 -D#SIZE1=64 - -# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=64 -D ENDIANNESS=MSB %s -o %t4.64.msb -# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.64.msb %t4.64.msb.o -# RUN: llvm-readobj --segments --sections --notes %t4.64.msb.o | \ -# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 - -# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=32 -D ENDIANNESS=LSB %s -o %t4.32.lsb -# RUN: llvm-objcopy --remove-note=1 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.32.lsb %t4.32.lsb.o -# RUN: llvm-readobj --segments --sections --notes %t4.32.lsb.o | \ -# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 - -# CHECK: Sections [ -# CHECK: Section { -# CHECK: Name: .note0 -# CHECK-NEXT: Type: SHT_NOTE -# CHECK-NEXT: Flags [ -# CHECK-NEXT: ] -# CHECK-NEXT: Address: -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: [[#%d,SIZE0]] -# CHECK: Name: .note1 -# CHECK-NEXT: Type: SHT_NOTE -# CHECK-NEXT: Flags [ -# CHECK-NEXT: ] -# CHECK-NEXT: Address: -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: [[#%d,SIZE1]] -# CHECK: Name: .note2 -# CHECK-NEXT: Type: SHT_NOTE -# CHECK-NEXT: Flags [ -# CHECK-NEXT: ] -# CHECK-NEXT: Address: -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0 - -# CHECK: NoteSections [ -# CHECK-NEXT: NoteSection { -# CHECK-NEXT: Name: .note0 -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0x[[#%X,SIZE0]] -# CHECK-NEXT: Notes [ -# CHECK-NEXT: { -# CHECK-NEXT: Owner: CORE -# CHECK-NEXT: Data size: 0x2 -# CHECK-NEXT: Type: NT_ARCH -# CHECK-NEXT: Description data ( -# CHECK-NEXT: 0000: 0201 -# CHECK-NEXT: ) -# CHECK-NEXT: } -# CHECK-NEXT: ] -# CHECK-NEXT: } -# CHECK-NEXT: NoteSection { -# CHECK-NEXT: Name: .note1 -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0x[[#%X,SIZE1]] -# CHECK-NEXT: Notes [ -# CHECK-NEXT: { -# CHECK-NEXT: Owner: LINUX -# CHECK-NEXT: Data size: 0x2 -# CHECK-NEXT: Type: Unknown (0x00000003) -# CHECK-NEXT: Description data ( -# CHECK-NEXT: 0000: 0301 -# CHECK-NEXT: ) -# CHECK-NEXT: } -# CHECK-NEXT: { -# CHECK-NEXT: Owner: CORE -# CHECK-NEXT: Data size: 0x2 -# CHECK-NEXT: Type: Unknown (0x00000004) -# CHECK-NEXT: Description data ( -# CHECK-NEXT: 0000: 0401 -# CHECK-NEXT: 
) -# CHECK-NEXT: } -# CHECK-NEXT: ] -# CHECK-NEXT: } -# CHECK-NEXT: NoteSection { -# CHECK-NEXT: Name: .note2 -# CHECK-NEXT: Offset: -# CHECK-NEXT: Size: 0x0 -# CHECK-NEXT: Notes [ -# CHECK-NEXT: ] -# CHECK-NEXT: } - ---- !ELF -FileHeader: - Class: ELFCLASS[[ELFCLASS]] - Data: ELFDATA2[[ENDIANNESS]] - Type: ET_REL - Machine: EM_X86_64 -Sections: - - Name: .note0 - Type: SHT_NOTE - AddressAlign: [[ALIGN]] - Notes: - - Name: CORE - Type: 0x01 - Desc: 0101 - - Name: CORE - Type: 0x02 - Desc: 0201 - - Name: .note1 - Type: SHT_NOTE - AddressAlign: [[ALIGN]] - Notes: - - Name: LINUX - Type: 0x03 - Desc: 0301 - - Name: CORE - Type: 0x03 - Desc: 0302 - - Name: CORE - Type: 0x04 - Desc: 0401 - - Name: .note2 - Type: SHT_NOTE - AddressAlign: [[ALIGN]] - Notes: - - Name: LINUX - Type: 0x01 - Desc: 0102 - -# RUN: yaml2obj --docnum=2 %s -o %t2 -# RUN: llvm-objcopy --remove-note=1 %t2 %t2o 2>&1 | FileCheck %s --check-prefix=TEST2 -# TEST2: warning: note segments are not supported -# TEST2-NOT: note segments are not supported - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_CORE - Machine: EM_X86_64 -ProgramHeaders: - - Type: PT_NOTE - FirstSec: .data0 - LastSec: .data0 - - Type: PT_NOTE - FirstSec: .data1 - LastSec: .data1 -Sections: - - Name: .data0 - Type: Fill - Size: 8 - - Name: .data1 - Type: Fill - Size: 8 - -# RUN: yaml2obj --docnum=3 %s -o %t3 -# RUN: llvm-objcopy --remove-note=1 %t3 %t3o 2>&1 | FileCheck %s --check-prefix=TEST3 -# TEST3: warning: cannot remove note(s) from .note: sections in segments are not supported - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_EXEC - Machine: EM_X86_64 -ProgramHeaders: - - Type: PT_LOAD - FirstSec: .note - LastSec: .note -Sections: - - Name: .note - Type: SHT_NOTE - AddressAlign: 4 - Notes: - - Name: ABC - Type: 1 - Desc: 0102 +## Check incompatible options. +# RUN: not llvm-objcopy --remove-note=1 --remove-section=.test - 2>&1 | FileCheck %s --check-prefix=ERR-REMSEC +# RUN: not llvm-objcopy --remove-note=1 --add-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-ADDSEC +# RUN: not llvm-objcopy --remove-note=1 --update-section=.test=%s - 2>&1 | FileCheck %s --check-prefix=ERR-UPDSEC + +# ERR-REMSEC: error: cannot specify both --remove-note and --remove-section +# ERR-ADDSEC: error: cannot specify both --remove-note and --add-section +# ERR-UPDSEC: error: cannot specify both --remove-note and --update-section + +## Check invalid argument formats. 
+# RUN: not llvm-objcopy --remove-note= - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID +# RUN: not llvm-objcopy --remove-note=CORE/ - 2>&1 | FileCheck %s --check-prefix=ERR-NOTYPEID +# RUN: not llvm-objcopy --remove-note=/1 - 2>&1 | FileCheck %s --check-prefix=ERR-EMPTYNAME +# RUN: not llvm-objcopy --remove-note=CORE/1/2 - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM1 +# RUN: not llvm-objcopy --remove-note=Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 +# RUN: not llvm-objcopy --remove-note=CORE/Notanumber - 2>&1 | FileCheck %s --check-prefix=ERR-INVNUM2 + +# ERR-NOTYPEID: error: bad format for --remove-note, missing type_id +# ERR-EMPTYNAME: error: bad format for --remove-note, note name is empty +# ERR-INVNUM1: error: bad note type_id for --remove-note: '1/2' +# ERR-INVNUM2: error: bad note type_id for --remove-note: 'Notanumber' + +## Check deleting notes: +## * --remove-note=1 will remove note "CORE/1" and "LINUX/1", +## * --remove-note=DUMMY/2 will not remove any notes because there are no notes with this owner, +## * --remove-note=CORE/3 will remove "CORE/3" but preserve "LINUX/3". +# RUN: yaml2obj --docnum=1 -D ALIGN=8 -D ELFCLASS=64 -D ENDIANNESS=LSB %s -o %t8.64.lsb +# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/2 --remove-note=CORE/0x03 %t8.64.lsb %t8.64.lsb.o +# RUN: llvm-readobj --segments --sections --notes %t8.64.lsb.o | \ +# RUN: FileCheck %s -D#SIZE0=32 -D#SIZE1=64 + +# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=64 -D ENDIANNESS=MSB %s -o %t4.64.msb +# RUN: llvm-objcopy --remove-note=0x01 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.64.msb %t4.64.msb.o +# RUN: llvm-readobj --segments --sections --notes %t4.64.msb.o | \ +# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 + +# RUN: yaml2obj --docnum=1 -D ALIGN=4 -D ELFCLASS=32 -D ENDIANNESS=LSB %s -o %t4.32.lsb +# RUN: llvm-objcopy --remove-note=1 --remove-note=DUMMY/0x02 --remove-note=CORE/3 %t4.32.lsb %t4.32.lsb.o +# RUN: llvm-readobj --segments --sections --notes %t4.32.lsb.o | \ +# RUN: FileCheck %s -D#SIZE0=24 -D#SIZE1=48 + +# CHECK: Sections [ +# CHECK: Section { +# CHECK: Name: .note0 +# CHECK-NEXT: Type: SHT_NOTE +# CHECK-NEXT: Flags [ +# CHECK-NEXT: ] +# CHECK-NEXT: Address: +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: [[#%d,SIZE0]] +# CHECK: Name: .note1 +# CHECK-NEXT: Type: SHT_NOTE +# CHECK-NEXT: Flags [ +# CHECK-NEXT: ] +# CHECK-NEXT: Address: +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: [[#%d,SIZE1]] +# CHECK: Name: .note2 +# CHECK-NEXT: Type: SHT_NOTE +# CHECK-NEXT: Flags [ +# CHECK-NEXT: ] +# CHECK-NEXT: Address: +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0 + +# CHECK: NoteSections [ +# CHECK-NEXT: NoteSection { +# CHECK-NEXT: Name: .note0 +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0x[[#%X,SIZE0]] +# CHECK-NEXT: Notes [ +# CHECK-NEXT: { +# CHECK-NEXT: Owner: CORE +# CHECK-NEXT: Data size: 0x2 +# CHECK-NEXT: Type: NT_ARCH +# CHECK-NEXT: Description data ( +# CHECK-NEXT: 0000: 0201 +# CHECK-NEXT: ) +# CHECK-NEXT: } +# CHECK-NEXT: ] +# CHECK-NEXT: } +# CHECK-NEXT: NoteSection { +# CHECK-NEXT: Name: .note1 +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0x[[#%X,SIZE1]] +# CHECK-NEXT: Notes [ +# CHECK-NEXT: { +# CHECK-NEXT: Owner: LINUX +# CHECK-NEXT: Data size: 0x2 +# CHECK-NEXT: Type: Unknown (0x00000003) +# CHECK-NEXT: Description data ( +# CHECK-NEXT: 0000: 0301 +# CHECK-NEXT: ) +# CHECK-NEXT: } +# CHECK-NEXT: { +# CHECK-NEXT: Owner: CORE +# CHECK-NEXT: Data size: 0x2 +# CHECK-NEXT: Type: Unknown (0x00000004) +# CHECK-NEXT: Description data ( +# CHECK-NEXT: 0000: 0401 +# CHECK-NEXT: 
) +# CHECK-NEXT: } +# CHECK-NEXT: ] +# CHECK-NEXT: } +# CHECK-NEXT: NoteSection { +# CHECK-NEXT: Name: .note2 +# CHECK-NEXT: Offset: +# CHECK-NEXT: Size: 0x0 +# CHECK-NEXT: Notes [ +# CHECK-NEXT: ] +# CHECK-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS[[ELFCLASS]] + Data: ELFDATA2[[ENDIANNESS]] + Type: ET_REL + Machine: EM_X86_64 +Sections: + - Name: .note0 + Type: SHT_NOTE + AddressAlign: [[ALIGN]] + Notes: + - Name: CORE + Type: 0x01 + Desc: 0101 + - Name: CORE + Type: 0x02 + Desc: 0201 + - Name: .note1 + Type: SHT_NOTE + AddressAlign: [[ALIGN]] + Notes: + - Name: LINUX + Type: 0x03 + Desc: 0301 + - Name: CORE + Type: 0x03 + Desc: 0302 + - Name: CORE + Type: 0x04 + Desc: 0401 + - Name: .note2 + Type: SHT_NOTE + AddressAlign: [[ALIGN]] + Notes: + - Name: LINUX + Type: 0x01 + Desc: 0102 + +# RUN: yaml2obj --docnum=2 %s -o %t2 +# RUN: llvm-objcopy --remove-note=1 %t2 %t2o 2>&1 | FileCheck %s --check-prefix=TEST2 +# TEST2: warning: note segments are not supported +# TEST2-NOT: note segments are not supported + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_CORE + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + FirstSec: .data0 + LastSec: .data0 + - Type: PT_NOTE + FirstSec: .data1 + LastSec: .data1 +Sections: + - Name: .data0 + Type: Fill + Size: 8 + - Name: .data1 + Type: Fill + Size: 8 + +# RUN: yaml2obj --docnum=3 %s -o %t3 +# RUN: llvm-objcopy --remove-note=1 %t3 %t3o 2>&1 | FileCheck %s --check-prefix=TEST3 +# TEST3: warning: cannot remove note(s) from .note: sections in segments are not supported + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_LOAD + FirstSec: .note + LastSec: .note +Sections: + - Name: .note + Type: SHT_NOTE + AddressAlign: 4 + Notes: + - Name: ABC + Type: 1 + Desc: 0102 From 9452ee4f750a849148a391ac75eb31220343fa1e Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Mon, 27 Jan 2025 00:11:18 -0800 Subject: [PATCH 164/432] [clang-format] Treat uppercase identifiers after struct as macros (#124397) This restores the behavior before llvmorg-20-init. Fixes #94184. Fixes #117477. Fixes #122690. Fixes #123142. 
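For illustration, the inputs the restored heuristic distinguishes
(EXPORT stands in for any all-uppercase macro; this snippet is not part
of the patch):

    // An all-uppercase identifier after `struct` is assumed to be a
    // macro, so StructName is taken as the record name and the braces
    // are annotated as TT_StructLBrace/TT_StructRBrace even when EXPORT
    // is not listed in AttributeMacros:
    struct EXPORT StructName {};

    // A mixed-case identifier is still treated as the record name:
    struct StructName {};

Listing the macro in AttributeMacros (as the updated test below also
checks) achieves the same without relying on the uppercase heuristic.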
---
 clang/lib/Format/UnwrappedLineParser.cpp      | 3 ++-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 4258329136348..906fc11a07d5e 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -4075,7 +4075,8 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) {
       break;
     default:
       if (!JSPastExtendsOrImplements && !ClassName &&
-          Previous->is(tok::identifier) && Previous->isNot(TT_AttributeMacro)) {
+          Previous->is(tok::identifier) && Previous->isNot(TT_AttributeMacro) &&
+          Previous->TokenText != Previous->TokenText.upper()) {
         ClassName = Previous;
       }
     }
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 10587449dcea9..585878e0edc5b 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -560,9 +560,16 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) {
   ASSERT_EQ(Tokens.size(), 15u) << Tokens;
   EXPECT_TOKEN(Tokens[11], tok::l_brace, TT_StructLBrace);
 
+  constexpr StringRef Code{"struct EXPORT StructName {};"};
+
+  Tokens = annotate(Code);
+  ASSERT_EQ(Tokens.size(), 7u) << Tokens;
+  EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace);
+  EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_StructRBrace);
+
   auto Style = getLLVMStyle();
   Style.AttributeMacros.push_back("EXPORT");
-  Tokens = annotate("struct EXPORT StructName {};", Style);
+  Tokens = annotate(Code, Style);
   ASSERT_EQ(Tokens.size(), 7u) << Tokens;
   EXPECT_TOKEN(Tokens[1], tok::identifier, TT_AttributeMacro);
   EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_StructLBrace);

From a01e1d4e044ec0147e04a5af9ca54ede550f5dc1 Mon Sep 17 00:00:00 2001
From: Nathan Ridge
Date: Mon, 27 Jan 2025 03:31:56 -0500
Subject: [PATCH 165/432] [clang][Sema] Handle dependent qualifier in
 HeuristicResolver::resolveDeclRefExpr() (#124515)

---
 clang/lib/Sema/HeuristicResolver.cpp           |  5 +++--
 .../unittests/Sema/HeuristicResolverTest.cpp   | 21 +++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp
index e893afed71d26..87c7274e7aefa 100644
--- a/clang/lib/Sema/HeuristicResolver.cpp
+++ b/clang/lib/Sema/HeuristicResolver.cpp
@@ -262,8 +262,9 @@ std::vector<const NamedDecl *> HeuristicResolverImpl::resolveMemberExpr(
 
 std::vector<const NamedDecl *>
 HeuristicResolverImpl::resolveDeclRefExpr(const DependentScopeDeclRefExpr *RE) {
-  return resolveDependentMember(QualType(RE->getQualifier()->getAsType(), 0),
-                                RE->getDeclName(), StaticFilter);
+  return resolveDependentMember(
+      resolveNestedNameSpecifierToType(RE->getQualifier()), RE->getDeclName(),
+      StaticFilter);
 }
 
 std::vector<const NamedDecl *>
diff --git a/clang/unittests/Sema/HeuristicResolverTest.cpp b/clang/unittests/Sema/HeuristicResolverTest.cpp
index 2b775b11719ea..e5cd1254d7542 100644
--- a/clang/unittests/Sema/HeuristicResolverTest.cpp
+++ b/clang/unittests/Sema/HeuristicResolverTest.cpp
@@ -385,6 +385,27 @@ TEST(HeuristicResolver, DeclRefExpr_RespectScope) {
       dependentScopeDeclRefExpr(hasDependentName("getPointer")).bind("input"));
 }
 
+TEST(HeuristicResolver, DeclRefExpr_Nested) {
+  std::string Code = R"cpp(
+    struct S {
+      static int Waldo;
+    };
+    template <typename T>
+    struct Meta {
+      using Type = S;
+    };
+    template <typename T>
+    void foo() {
+      Meta<T>::Type::Waldo;
+    }
+  )cpp";
+  // Test resolution of "Waldo" in "Meta<T>::Type::Waldo".
+ expectResolution( + Code, &HeuristicResolver::resolveDeclRefExpr, + dependentScopeDeclRefExpr(hasDependentName("Waldo")).bind("input"), + varDecl(hasName("Waldo")).bind("output")); +} + TEST(HeuristicResolver, DependentNameType) { std::string Code = R"cpp( template From bd38c4993aa41d89a13cbc4dc457df4d81e410bf Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 09:21:56 +0000 Subject: [PATCH 166/432] [AArch64] Generate zeroing forms of certain SVE2.2 instructions (8/11) (#116834) SVE2.2 introduces instructions with predicated forms with zeroing of the inactive lanes. This allows in some cases to save a `movprfx` or a `mov` instruction when emitting code for `_x` or `_z` variants of intrinsics. This patch adds support for emitting the zeroing forms of certain `FRINTx`, `FRECPX`, and `FSQRT` instructions. --- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 18 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 9 +- .../zeroing-forms-frint-frecpx-fsqrt.ll | 4656 +++++++++++++++++ 3 files changed, 4673 insertions(+), 10 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 6d5e2697160ab..9ed683e73e9cc 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4301,17 +4301,17 @@ let Predicates = [HasSVE2p2_or_SME2p2] in { defm FRINT64X_ZPzZ : sve_fp_z2op_p_zd_frint<0b11, "frint64x">; // Floating-point round to integral fp value, zeroing predicate - defm FRINTN_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00000, "frintn">; - defm FRINTP_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00001, "frintp">; - defm FRINTM_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00010, "frintm">; - defm FRINTZ_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00011, "frintz">; - defm FRINTA_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00100, "frinta">; - defm FRINTX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00110, "frintx">; - defm FRINTI_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00111, "frinti">; + defm FRINTN_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00000, "frintn", AArch64frintn_mt>; + defm FRINTP_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00001, "frintp", AArch64frintp_mt>; + defm FRINTM_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00010, "frintm", AArch64frintm_mt>; + defm FRINTZ_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00011, "frintz", AArch64frintz_mt>; + defm FRINTA_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00100, "frinta", AArch64frinta_mt>; + defm FRINTX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00110, "frintx", AArch64frintx_mt>; + defm FRINTI_ZPzZ : sve_fp_z2op_p_zd_hsd<0b00111, "frinti", AArch64frinti_mt>; // Floating-point invert exponent, zeroing predicate - defm FRECPX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b01100, "frecpx">; + defm FRECPX_ZPzZ : sve_fp_z2op_p_zd_hsd<0b01100, "frecpx", AArch64frecpx_mt>; // Floating-point square root, zeroing predicate - defm FSQRT_ZPZz : sve_fp_z2op_p_zd_hsd<0b01101, "fsqrt">; + defm FSQRT_ZPZz : sve_fp_z2op_p_zd_hsd<0b01101, "fsqrt", AArch64fsqrt_mt>; // SVE2p2 integer unary arithmetic (bitwise), zeroing predicate defm CLS_ZPzZ : sve_int_un_pred_arit_bitwise_z<0b000, "cls", AArch64cls_mt>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 2ee9910da5079..8125014faa033 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3270,10 +3270,17 @@ multiclass sve_fp_z2op_p_zd { defm : SVE_3_Op_UndefZero_Pat(NAME # _DtoS)>; } -multiclass sve_fp_z2op_p_zd_hsd opc, string asm> { +multiclass sve_fp_z2op_p_zd_hsd opc, string asm, 
SDPatternOperator op> { def _H : sve_fp_z2op_p_zd<{ 0b01, opc }, asm, ZPR16, ZPR16>; def _S : sve_fp_z2op_p_zd<{ 0b10, opc }, asm, ZPR32, ZPR32>; def _D : sve_fp_z2op_p_zd<{ 0b11, opc }, asm, ZPR64, ZPR64>; + + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _H)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _S)>; + defm : SVE_1_Op_PassthruUndefZero_Pat(NAME # _D)>; } multiclass sve_fp_z2op_p_zd_frint opc, string asm> { diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll new file mode 100644 index 0000000000000..c493ec2dcc95d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-frint-frecpx-fsqrt.ll @@ -0,0 +1,4656 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve < %s | FileCheck %s +; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 + +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 + +target triple = "aarch64-linux" + +define @test_svrinta_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_4f16_z: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, 
z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinta_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinta_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinta z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrinta_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinta_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frinta z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinta_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_4f16_x_1: +; CHECK-2p2: // 
%bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: 
frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrinti_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrinti_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frinti z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrinti_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( poison, %pg, %x) + ret %0 +} + + +define @test_svrinti_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrinti_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frinti z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrinti_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + + +define @test_svrintm_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call 
@llvm.aarch64.sve.frintm.nxv8f16( poison, %pg, %x) + ret %0 +} + + +define @test_svrintm_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintm_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintm_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintm_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintm_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintm_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintm_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintm z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintm_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f32( poison, %pg, %x) + ret %0 +} + 
+define <vscale x 2 x float> @test_svrintm_2f32_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintm_2f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_2f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintm.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 2 x float> @test_svrintm_2f32_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintm_2f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_2f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintm.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintm_f32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintm_f32_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintm z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f32_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintm.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintm_f32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintm_f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintm.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintm_f32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintm_f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintm z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintm.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svrintm_f64_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintm_f64_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintm z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f64_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.d, p0/z, z0.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintm.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svrintm_f64_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintm_f64_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintm z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f64_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintm.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svrintm_f64_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintm_f64_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    frintm z0.d, p0/m, z1.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintm_f64_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintm z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintm.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 8 x half> @test_svrintn_f16_x_1(<vscale x 8 x i1> %pg, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svrintn_f16_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f16_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.frintn.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svrintn_f16_x_2(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svrintn_f16_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f16_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.frintn.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 8 x half> @test_svrintn_f16_z(<vscale x 8 x i1> %pg, double %z0, <vscale x 8 x half> %x) {
+; CHECK-LABEL: test_svrintn_f16_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f16_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 8 x half> @llvm.aarch64.sve.frintn.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> %pg, <vscale x 8 x half> %x)
+  ret <vscale x 8 x half> %0
+}
+
+define <vscale x 4 x half> @test_svrintn_4f16_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x half> %x) {
+; CHECK-LABEL: test_svrintn_4f16_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_4f16_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x half> @llvm.aarch64.sve.frintn.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x half> %x)
+  ret <vscale x 4 x half> %0
+}
+
+define <vscale x 4 x half> @test_svrintn_4f16_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x half> %x) {
+; CHECK-LABEL: test_svrintn_4f16_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_4f16_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x half> @llvm.aarch64.sve.frintn.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x i1> %pg, <vscale x 4 x half> %x)
+  ret <vscale x 4 x half> %0
+}
+
+define <vscale x 4 x half> @test_svrintn_4f16_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x half> %x) {
+; CHECK-LABEL: test_svrintn_4f16_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_4f16_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x half> @llvm.aarch64.sve.frintn.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x half> %x)
+  ret <vscale x 4 x half> %0
+}
+
+define <vscale x 2 x half> @test_svrintn_2f16_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x half> %x) {
+; CHECK-LABEL: test_svrintn_2f16_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.h, p0/m, z0.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f16_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z0.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x half> @llvm.aarch64.sve.frintn.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x half> %x)
+  ret <vscale x 2 x half> %0
+}
+
+define <vscale x 2 x half> @test_svrintn_2f16_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x half> %x) {
+; CHECK-LABEL: test_svrintn_2f16_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f16_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x half> @llvm.aarch64.sve.frintn.nxv2f16(<vscale x 2 x half> poison, <vscale x 2 x i1> %pg, <vscale x 2 x half> %x)
+  ret <vscale x 2 x half> %0
+}
+
+define <vscale x 2 x half> @test_svrintn_2f16_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x half> %x) {
+; CHECK-LABEL: test_svrintn_2f16_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    frintn z0.h, p0/m, z1.h
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f16_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.h, p0/z, z1.h
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x half> @llvm.aarch64.sve.frintn.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x half> %x)
+  ret <vscale x 2 x half> %0
+}
+
+define <vscale x 2 x float> @test_svrintn_2f32_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintn_2f32_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f32_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintn.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 2 x float> @test_svrintn_2f32_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintn_2f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintn.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 2 x float> @test_svrintn_2f32_z(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x float> %x) {
+; CHECK-LABEL: test_svrintn_2f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_2f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x float> @llvm.aarch64.sve.frintn.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, <vscale x 2 x float> %x)
+  ret <vscale x 2 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintn_f32_x_1(<vscale x 4 x i1> %pg, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintn_f32_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f32_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z0.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintn.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintn_f32_x_2(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintn_f32_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f32_x_2:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintn.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 4 x float> @test_svrintn_f32_z(<vscale x 4 x i1> %pg, double %z0, <vscale x 4 x float> %x) {
+; CHECK-LABEL: test_svrintn_f32_z:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    frintn z0.s, p0/m, z1.s
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f32_z:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 4 x float> @llvm.aarch64.sve.frintn.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, <vscale x 4 x float> %x)
+  ret <vscale x 4 x float> %0
+}
+
+define <vscale x 2 x double> @test_svrintn_f64_x_1(<vscale x 2 x i1> %pg, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintn_f64_x_1:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frintn z0.d, p0/m, z0.d
+; CHECK-NEXT:    ret
+;
+; CHECK-2p2-LABEL: test_svrintn_f64_x_1:
+; CHECK-2p2:       // %bb.0: // %entry
+; CHECK-2p2-NEXT:    frintn z0.d, p0/z, z0.d
+; CHECK-2p2-NEXT:    ret
+entry:
+  %0 = tail call <vscale x 2 x double> @llvm.aarch64.sve.frintn.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x i1> %pg, <vscale x 2 x double> %x)
+  ret <vscale x 2 x double> %0
+}
+
+define <vscale x 2 x double> @test_svrintn_f64_x_2(<vscale x 2 x i1> %pg, double %z0, <vscale x 2 x double> %x) {
+; CHECK-LABEL: test_svrintn_f64_x_2:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    frintn z0.d, p0/m, z1.d
+; 
CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintn_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintn_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintn_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintn z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintn_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f16_x_1: +; CHECK-2p2: // 
%bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp 
z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintp_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintp_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintp z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintp_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( poison, %pg, %x) + ret %0 +} + + +define @test_svrintp_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintp_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintp z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintp_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call 
@llvm.aarch64.sve.frintx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define 
@test_svrintx_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintx_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintx_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintx z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintx_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintx_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintx_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_4f16_x_1( %pg, %x) { +; CHECK-LABEL: 
test_svrintz_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrintz_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrintz_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frintz z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrintz_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrintz_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frintz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrintz_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; 
CHECK-2p2-LABEL: test_svrecpx_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry 
+; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svrecpx_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svrecpx_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: frecpx z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svrecpx_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svrecpx_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: frecpx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svrecpx_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z1.d +; 
CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_4f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_4f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_4f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_4f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_4f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_4f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_4f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_4f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_4f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_2f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( poison, %pg, %x) + ret %0 +} + +define 
@test_svsqrt_2f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_2f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_2f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_2f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_2f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svsqrt_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svsqrt_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fsqrt z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svsqrt_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svsqrt_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: fsqrt z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svsqrt_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call 
@llvm.aarch64.sve.frinta.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinta_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinta z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinta_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinta_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinta z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrinta_nxv2f64_ptrue(double %z0, %x, %y) 
{ +; CHECK-LABEL: test_svfrinta_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinta z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinta_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinta z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinta.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: 
movprfx z0, z2 +; CHECK-NEXT: frinti z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frinti z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrinti_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrinti_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frinti z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrinti_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrinti_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrinti_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frinti z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: 
test_svfrinti_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frinti z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frinti.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm 
z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintm z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintm_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintm_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintm z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintm.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintm_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintm_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintm z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintm_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintm z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call 
@llvm.aarch64.sve.frintm.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv2f32_ptrue_u(double %z0, %x) { +; 
CHECK-LABEL: test_svfrintn_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintn z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintn_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintn_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintn z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintn_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintn_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintn z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintn_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintn z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintn.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx 
z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f32_ptrue_u: +; 
CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintp z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintp_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintp_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintp z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintp_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintp_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintp z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintp_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintp z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintp.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: 
ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( poison, 
%pg, %x) + ret %0 +} + +define @test_svfrintx_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintx_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintx_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintx_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintx_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintx z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintx_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintx z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintx.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: 
test_svfrintz_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; 
CHECK-NEXT: frintz z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frintz z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrintz_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrintz_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frintz z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrintz_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrintz_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frintz z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrintz_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frintz z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frintz.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv8f16_ptrue: +; CHECK-2p2: 
// %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv8f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret 
+entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: frecpx z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv4f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfrecpx_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfrecpx_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: frecpx z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svfrecpx_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfrecpx_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: frecpx z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfrecpx_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: frecpx z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.frecpx.nxv2f64( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv8f16( %x, %pg, %y) + ret %0 
+} + + +define @test_svfsqrt_nxv4f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv4f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv4f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv4f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv4f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv4f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv2f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv2f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv2f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv2f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f16( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv2f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv2f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z1 +; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( poison, %pg, %x) + ret %0 +} + +define @test_svfsqrt_nxv2f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svfsqrt_nxv2f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: movprfx z0, z2 +; CHECK-NEXT: fsqrt z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svfsqrt_nxv2f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f32( %x, %pg, %y) + ret %0 +} + + +define @test_svfsqrt_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svfsqrt_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: 
ptrue p0.s
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fsqrt z0.s, p0/m, z1.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv4f32_ptrue_u:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.s
+; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z1.s
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( poison, %pg, %x)
+  ret %0
+}
+
+define @test_svfsqrt_nxv4f32_ptrue(double %z0, %x, %y) {
+; CHECK-LABEL: test_svfsqrt_nxv4f32_ptrue:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fsqrt z0.s, p0/m, z2.s
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv4f32_ptrue:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.s
+; CHECK-2p2-NEXT: fsqrt z0.s, p0/z, z2.s
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv4f32( %x, %pg, %y)
+  ret %0
+}
+
+
+define @test_svfsqrt_nxv2f64_ptrue_u(double %z0, %x) {
+; CHECK-LABEL: test_svfsqrt_nxv2f64_ptrue_u:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: movprfx z0, z1
+; CHECK-NEXT: fsqrt z0.d, p0/m, z1.d
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv2f64_ptrue_u:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.d
+; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z1.d
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( poison, %pg, %x)
+  ret %0
+}
+
+define @test_svfsqrt_nxv2f64_ptrue(double %z0, %x, %y) {
+; CHECK-LABEL: test_svfsqrt_nxv2f64_ptrue:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: movprfx z0, z2
+; CHECK-NEXT: fsqrt z0.d, p0/m, z2.d
+; CHECK-NEXT: ret
+;
+; CHECK-2p2-LABEL: test_svfsqrt_nxv2f64_ptrue:
+; CHECK-2p2: // %bb.0: // %entry
+; CHECK-2p2-NEXT: ptrue p0.d
+; CHECK-2p2-NEXT: fsqrt z0.d, p0/z, z2.d
+; CHECK-2p2-NEXT: ret
+entry:
+  %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+  %0 = tail call @llvm.aarch64.sve.fsqrt.nxv2f64( %x, %pg, %y)
+  ret %0
+}
+

From 351ee30529c054d39ea742c1b9c738c9e70c131b Mon Sep 17 00:00:00 2001
From: bernhardu
Date: Mon, 27 Jan 2025 10:49:07 +0100
Subject: [PATCH 167/432] [win/asan] GetInstructionSize: Support some more 6 byte instructions. (#124006)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds several instructions seen when trying to run an
executable built with ASan with llvm-mingw. (x86 and x86_64, using the
git tip in llvm-project). Also includes instructions collected by Roman
Pišl and Eric Pouech in the Wine bug reports below.
```
Related: https://github.com/llvm/llvm-project/issues/96270
Co-authored-by: Roman Pišl
https://bugs.winehq.org/show_bug.cgi?id=50993
https://bugs.winehq.org/attachment.cgi?id=70233
Co-authored-by: Eric Pouech
https://bugs.winehq.org/show_bug.cgi?id=52386
https://bugs.winehq.org/attachment.cgi?id=71626
```
---
 compiler-rt/lib/interception/interception_win.cpp            | 5 +++++
 compiler-rt/lib/interception/tests/interception_win_test.cpp | 3 +++
 2 files changed, 8 insertions(+)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index 64004c171d534..de6e74edce2d1 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -661,6 +661,10 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0xC1F6: // F6 C1 XX : test cl, XX
       return 3;
 
+    case 0x89FF: // FF 89 XX XX XX XX : dec dword ptr [ecx + XX XX XX XX]
+    case 0xEC81: // 81 EC XX XX XX XX : sub esp, XX XX XX XX
+      return 6;
+
     // Cannot overwrite control-instruction. Return 0 to indicate failure.
     case 0x25FF: // FF 25 XX YY ZZ WW : jmp dword ptr ds:[WWZZYYXX]
       return 0;
@@ -739,6 +743,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
     case 0x058B: // 8B 05 XX XX XX XX : mov eax, dword ptr [XX XX XX XX]
       if (rel_offset)
         *rel_offset = 2;
+    case 0xB841: // 41 B8 XX XX XX XX : mov r8d, XX XX XX XX
       return 6;
 
     case 0x7E81: // 81 7E YY XX XX XX XX  cmp DWORD PTR [rsi+YY], XX XX XX XX
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index c5dcf26070f0d..c29968e974742 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -870,6 +870,8 @@ const struct InstructionSizeData {
     { 5, {0xb8, 0x71, 0x72, 0x73, 0x74}, 0, "b8 XX XX XX XX : mov eax, XX XX XX XX"},
     { 5, {0xB9, 0x71, 0x72, 0x73, 0x74}, 0, "b9 XX XX XX XX : mov ecx, XX XX XX XX"},
     { 5, {0xBA, 0x71, 0x72, 0x73, 0x74}, 0, "ba XX XX XX XX : mov edx, XX XX XX XX"},
+    { 6, {0x81, 0xEC, 0x72, 0x73, 0x74, 0x75}, 0, "81 EC XX XX XX XX : sub esp, XX XX XX XX"},
+    { 6, {0xFF, 0x89, 0x72, 0x73, 0x74, 0x75}, 0, "FF 89 XX XX XX XX : dec dword ptr [ecx + XX XX XX XX]"},
     { 7, {0x8D, 0xA4, 0x24, 0x73, 0x74, 0x75, 0x76}, 0, "8D A4 24 XX XX XX XX : lea esp, [esp + XX XX XX XX]"},
 #if SANITIZER_WINDOWS_x64
     // sorted list
@@ -1000,6 +1002,7 @@ const struct InstructionSizeData {
     { 5, {0x66, 0x48, 0x0F, 0x7E, 0xC0}, 0, "66 48 0F 7E C0 : movq rax, xmm0"},
     { 5, {0x83, 0x44, 0x72, 0x73, 0x74}, 0, "83 44 72 XX YY : add DWORD PTR [rdx+rsi*2+XX],YY"},
     { 5, {0x83, 0x64, 0x24, 0x73, 0x74}, 0, "83 64 24 XX YY : and DWORD PTR [rsp+XX], YY"},
+    { 6, {0x41, 0xB8, 0x72, 0x73, 0x74, 0x75}, 0, "41 B8 XX XX XX XX : mov r8d, XX XX XX XX"},
     { 6, {0x48, 0x83, 0x64, 0x24, 0x74, 0x75}, 0, "48 83 64 24 XX YY : and QWORD PTR [rsp + XX], YY"},
     { 6, {0x66, 0x81, 0x78, 0x73, 0x74, 0x75}, 0, "66 81 78 XX YY YY : cmp WORD PTR [rax+XX], YY YY"},
     { 6, {0x66, 0x81, 0x79, 0x73, 0x74, 0x75}, 0, "66 81 79 XX YY YY : cmp WORD PTR [rcx+XX], YY YY"},

From bbf377060adc8607e1187952388c7eeea7cf4933 Mon Sep 17 00:00:00 2001
From: bernhardu
Date: Mon, 27 Jan 2025 10:50:54 +0100
Subject: [PATCH 168/432] [win/asan] GetInstructionSize: Support some more 7 or 8 byte instructions. (#124011)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch adds several instructions seen when trying to run an
executable built with ASan with llvm-mingw. (x86 and x86_64, using the
git tip in llvm-project). Also includes instructions collected by Roman
Pišl and Eric Pouech in the Wine bug reports below.
```
Related: https://github.com/llvm/llvm-project/issues/96270
Co-authored-by: Roman Pišl
https://bugs.winehq.org/show_bug.cgi?id=50993
https://bugs.winehq.org/attachment.cgi?id=70233
Co-authored-by: Eric Pouech
https://bugs.winehq.org/show_bug.cgi?id=52386
https://bugs.winehq.org/attachment.cgi?id=71626
```
---
 compiler-rt/lib/interception/interception_win.cpp            | 3 +++
 compiler-rt/lib/interception/tests/interception_win_test.cpp | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/compiler-rt/lib/interception/interception_win.cpp b/compiler-rt/lib/interception/interception_win.cpp
index de6e74edce2d1..002b37468a200 100644
--- a/compiler-rt/lib/interception/interception_win.cpp
+++ b/compiler-rt/lib/interception/interception_win.cpp
@@ -857,6 +857,7 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
       return 6;
 
     case 0xec8148: // 48 81 EC XX XX XX XX : sub rsp, XXXXXXXX
+    case 0xc0c748: // 48 C7 C0 XX XX XX XX : mov rax, XX XX XX XX
       return 7;
 
     // clang-format off
@@ -918,6 +919,8 @@ static size_t GetInstructionSize(uptr address, size_t* rel_offset = nullptr) {
       return 5;
     case 0x24648348: // 48 83 64 24 XX YY : and QWORD PTR [rsp + XX], YY
       return 6;
+    case 0x24A48D48: // 48 8D A4 24 XX XX XX XX : lea rsp, [rsp + XX XX XX XX]
+      return 8;
   }
 
   switch (0xFFFFFFFFFFULL & *(u64 *)(address)) {
diff --git a/compiler-rt/lib/interception/tests/interception_win_test.cpp b/compiler-rt/lib/interception/tests/interception_win_test.cpp
index c29968e974742..2a7549d230ae2 100644
--- a/compiler-rt/lib/interception/tests/interception_win_test.cpp
+++ b/compiler-rt/lib/interception/tests/interception_win_test.cpp
@@ -1022,6 +1022,7 @@ const struct InstructionSizeData {
     { 7, {0x48, 0x89, 0x15, 0x73, 0x74, 0x75, 0x76}, 3, "48 89 15 XX XX XX XX : mov QWORD PTR [rip + XXXXXXXX], rdx"},
     { 7, {0x48, 0x8b, 0x05, 0x73, 0x74, 0x75, 0x76}, 3, "48 8b 05 XX XX XX XX : mov rax, QWORD PTR [rip + XXXXXXXX]"},
     { 7, {0x48, 0x8d, 0x05, 0x73, 0x74, 0x75, 0x76}, 3, "48 8d 05 XX XX XX XX : lea rax, QWORD PTR [rip + XXXXXXXX]"},
+    { 7, {0x48, 0xc7, 0xc0, 0x73, 0x74, 0x75, 0x76}, 0, "48 C7 C0 XX XX XX XX : mov rax, XX XX XX XX"},
     { 7, {0x48, 0xff, 0x25, 0x73, 0x74, 0x75, 0x76}, 3, "48 ff 25 XX XX XX XX : rex.W jmp QWORD PTR [rip + XXXXXXXX]"},
     { 7, {0x4C, 0x8D, 0x15, 0x73, 0x74, 0x75, 0x76}, 3, "4c 8d 15 XX XX XX XX : lea r10, [rip + XX]"},
     { 7, {0x81, 0x78, 0x72, 0x73, 0x74, 0x75, 0x76}, 0, "81 78 YY XX XX XX XX : cmp DWORD PTR [rax+YY], XX XX XX XX"},
@@ -1037,6 +1038,7 @@ const struct InstructionSizeData {
     { 8, {0x41, 0x81, 0x7d, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "41 81 7d XX YY YY YY YY : cmp DWORD PTR [r13+YY], XX XX XX XX"},
     { 8, {0x41, 0x81, 0x7e, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "41 81 7e XX YY YY YY YY : cmp DWORD PTR [r14+YY], XX XX XX XX"},
     { 8, {0x41, 0x81, 0x7f, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "41 81 7f YY XX XX XX XX : cmp DWORD PTR [r15+YY], XX XX XX XX"},
+    { 8, {0x48, 0x8D, 0xA4, 0x24, 0x74, 0x75, 0x76, 0x77}, 0, "48 8D A4 24 XX XX XX XX : lea rsp, [rsp + XX XX XX XX]"},
     { 8, {0x81, 0x7c, 0x24, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "81 7c 24 YY XX XX XX XX : cmp DWORD PTR [rsp+YY], XX XX XX XX"},
     { 8, {0xc7, 0x44, 0x24, 0x73, 0x74, 0x75, 0x76, 0x77}, 0, "C7 44 24 XX YY YY YY YY : mov dword ptr [rsp + XX], YYYYYYYY"},
     { 9, {0x41, 0x81, 0x7c, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78}, 0, "41 81 7c ZZ YY XX XX XX XX : cmp DWORD PTR [reg+reg*n+YY], XX XX XX XX"},

From 7211bf48a62bfe3a181013f412f2fa6e112ae99f Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Mon, 27 Jan 2025 10:51:23 +0100
Subject: [PATCH 169/432] [flang][driver] add negative form of -fsave-main-program (#124110)

Add the `-fno` form for consistency and to make it easy to switch the
default for downstream users.
---
 clang/include/clang/Driver/Options.td     | 7 +++++--
 clang/lib/Driver/ToolChains/Flang.cpp     | 3 ++-
 flang/lib/Frontend/CompilerInvocation.cpp | 9 +++++----
 flang/test/Driver/fsave-main-program.f90  | 6 +++++-
 flang/test/Lower/fsave-main-program.f90   | 1 +
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index c5b7fcb7c7f09..6c171a62bbeee 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6970,8 +6970,11 @@ defm unsigned : OptInFC1FFlag<"unsigned", "Enables UNSIGNED type">;
 def fno_automatic : Flag<["-"], "fno-automatic">, Group,
   HelpText<"Implies the SAVE attribute for non-automatic local objects in subprograms unless RECURSIVE">;
-def fsave_main_program : Flag<["-"], "fsave-main-program">, Group,
-  HelpText<"Place all variables from the main program in static memory (otherwise scalars may be placed on the stack)">;
+defm save_main_program : BoolOptionWithoutMarshalling<"f", "save-main-program",
+  PosFlag,
+  NegFlag>;
 defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays",
   PosFlag,
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp
index 45d05ed3e2485..1ae865f379110 100644
--- a/clang/lib/Driver/ToolChains/Flang.cpp
+++ b/clang/lib/Driver/ToolChains/Flang.cpp
@@ -58,7 +58,8 @@ void Flang::addFortranDialectOptions(const ArgList &Args,
                         options::OPT_fhermetic_module_files,
                         options::OPT_frealloc_lhs,
                         options::OPT_fno_realloc_lhs,
-                        options::OPT_fsave_main_program});
+                        options::OPT_fsave_main_program,
+                        options::OPT_fno_save_main_program});
 }
 
 void Flang::addPreprocessingOptions(const ArgList &Args,
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 3c6da4687f65d..68b5950d3a51b 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -770,10 +770,11 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
     opts.features.Enable(Fortran::common::LanguageFeature::DefaultSave);
   }
 
-  // -fsave-main-program
-  if (args.hasArg(clang::driver::options::OPT_fsave_main_program)) {
-    opts.features.Enable(Fortran::common::LanguageFeature::SaveMainProgram);
-  }
+  // -f{no}-save-main-program
+  opts.features.Enable(
+      Fortran::common::LanguageFeature::SaveMainProgram,
+      args.hasFlag(clang::driver::options::OPT_fsave_main_program,
+                   clang::driver::options::OPT_fno_save_main_program, false));
 
   if (args.hasArg(
           clang::driver::options::OPT_falternative_parameter_statement)) {
diff --git a/flang/test/Driver/fsave-main-program.f90 b/flang/test/Driver/fsave-main-program.f90
index bffdfd97911e8..e7a2f9d8b470e 100644
--- a/flang/test/Driver/fsave-main-program.f90
+++ b/flang/test/Driver/fsave-main-program.f90
@@ -1,5 +1,9 @@
 ! Check that the driver passes through -fsave-main-program:
 ! RUN: %flang -### -S -fsave-main-program %s -o - 2>&1 | FileCheck %s
+! CHECK: "-fc1"{{.*}}"-fsave-main-program"
+
+! RUN: %flang -### -S -fno-save-main-program %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK2
+! CHECK2: "-fc1"{{.*}}"-fno-save-main-program"
+
 ! Check that the compiler accepts -fsave-main-program:
 ! RUN: %flang_fc1 -emit-hlfir -fsave-main-program %s -o -
-! CHECK: "-fc1"{{.*}}"-fsave-main-program"
diff --git a/flang/test/Lower/fsave-main-program.f90 b/flang/test/Lower/fsave-main-program.f90
index 17fc1b02f5068..e89244c3c7c51 100644
--- a/flang/test/Lower/fsave-main-program.f90
+++ b/flang/test/Lower/fsave-main-program.f90
@@ -1,6 +1,7 @@
 ! Test -fsave-main-program switch.
 ! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s
 ! RUN: %flang_fc1 -fsave-main-program -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-SAVE %s
+! RUN: %flang_fc1 -fsave-main-program -fno-save-main-program -emit-hlfir -o - %s | FileCheck --check-prefix=CHECK-DEFAULT %s
 program test
 integer :: i
 call foo(i)

From 3b5e9eed2f67c1fb6dcf7033e92509ba2b0381e9 Mon Sep 17 00:00:00 2001
From: Durgadoss R
Date: Mon, 27 Jan 2025 15:52:43 +0530
Subject: [PATCH 170/432] [NVPTX] Add float to tf32 conversion intrinsics (#124316)

This patch adds the set of f32 -> tf32 cvt intrinsics introduced
in sm100 with ptx8.6. This builds on top of the recent PR #121507.
Tests are verified with a 12.8 ptxas executable.

PTX ISA link:
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

Signed-off-by: Durgadoss R
---
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  8 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td  |  5 ++
 llvm/test/CodeGen/NVPTX/convert-sm100.ll | 68 ++++++++++++++++++++++++
 3 files changed, 81 insertions(+)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 68c2373a1a454..9a2f38d760e65 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1444,10 +1444,18 @@ let TargetPrefix = "nvvm" in {
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rn_relu : ClangBuiltin<"__nvvm_f2tf32_rn_relu">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rn_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rn_relu_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rz : ClangBuiltin<"__nvvm_f2tf32_rz">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_f2tf32_rz_relu : ClangBuiltin<"__nvvm_f2tf32_rz_relu">,
       Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
+  def int_nvvm_f2tf32_rz_relu_satfinite : ClangBuiltin<"__nvvm_f2tf32_rz_relu_satfinite">,
+      Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
 
   def int_nvvm_ff_to_e4m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e4m3x2_rn">,
       Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f17799c130015..633a99d0fc1be 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -764,6 +764,11 @@ let hasSideEffects = false in { defm CVT_to_tf32_rz_relu : CVT_TO_TF32<"rz.relu">; defm CVT_to_tf32_rna : CVT_TO_TF32<"rna", [hasPTX<70>, hasSM<80>]>; defm CVT_to_tf32_rna_satf : CVT_TO_TF32<"rna.satfinite", [hasPTX<81>, hasSM<89>]>; + + defm CVT_to_tf32_rn_satf : CVT_TO_TF32<"rn.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rn_relu_satf : CVT_TO_TF32<"rn.relu.satfinite", [hasPTX<86>, hasSM<100>]>; + defm CVT_to_tf32_rz_relu_satf : CVT_TO_TF32<"rz.relu.satfinite", [hasPTX<86>, hasSM<100>]>; } def fpround_oneuse : PatFrag<(ops node:$a), (fpround node:$a), [{ diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll new file mode 100644 index 0000000000000..f92822f7e0c16 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} + +declare i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1) +declare i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1) +declare i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1) + +define i32 @cvt_rn_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rn.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn.satfinite(float %f1) + ret i32 %val +} + +define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rn_relu_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rn.relu.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz.satfinite(float %f1) + ret i32 %val +} + +define i32 @cvt_rz_relu_satf_tf32_f32(float %f1) { +; CHECK-LABEL: cvt_rz_relu_satf_tf32_f32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .f32 %f<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0]; +; CHECK-NEXT: cvt.rz.relu.satfinite.tf32.f32 %r1, %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %val = call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float %f1) + ret i32 %val +} From 87103a016fbfd480e1d3bb8eba23c27a9c74e70d Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 10:41:53 +0000 Subject: 
[PATCH 171/432] [AArch64] Implement NEON FP8 vectors as VectorType
 (#123603)

Reimplement Neon FP8 vector types using the attribute `neon_vector_type`
instead of having them as builtin types. This makes it possible to implement
FP8 Neon intrinsics without adding special cases for these types when using
`__builtin_shufflevector` or bitcasts (via the C-style cast operator) between
vectors, both of which are used extensively in the generated code in
`arm_neon.h`.
---
 clang/include/clang/AST/Type.h                |   5 +
 .../clang/Basic/AArch64SVEACLETypes.def       |   2 -
 clang/include/clang/Basic/TargetBuiltins.h    |   4 +-
 clang/lib/AST/ItaniumMangle.cpp               |   5 +
 clang/lib/CodeGen/CGBuiltin.cpp               |   1 +
 clang/lib/CodeGen/CGExpr.cpp                  |  11 +-
 clang/lib/CodeGen/CodeGenTypes.cpp            |   4 +-
 clang/lib/CodeGen/Targets/AArch64.cpp         |   7 +-
 clang/lib/Sema/SemaARM.cpp                    |   2 +
 clang/lib/Sema/SemaExpr.cpp                   |   7 +-
 clang/lib/Sema/SemaType.cpp                   |   3 +-
 .../AArch64/builtin-shufflevector-fp8.c       | 123 +++++++++++
 clang/test/CodeGen/AArch64/fp8-cast.c         | 193 ++++++++++++++++++
 clang/test/CodeGen/arm-mfp8.c                 |  88 ++++----
 .../aarch64-mangle-neon-vectors.cpp           |   7 +
 clang/test/CodeGenCXX/mangle-neon-vectors.cpp |  11 +
 clang/test/Sema/aarch64-fp8-cast.c            | 104 ++++++++++
 clang/test/Sema/arm-mfp8.cpp                  |  34 +--
 clang/utils/TableGen/NeonEmitter.cpp          |  11 +-
 19 files changed, 553 insertions(+), 69 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
 create mode 100644 clang/test/CodeGen/AArch64/fp8-cast.c
 create mode 100644 clang/test/Sema/aarch64-fp8-cast.c
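To see what the new representation enables, here is a minimal sketch distilled
from the tests added in this patch: the FP8 vector types now work with
`__builtin_shufflevector` and with C-style bitcasts like any other NEON vector
type, while arithmetic on them is still deliberately rejected in Sema (see the
arm-mfp8.cpp diagnostics below).

```c
#include <arm_neon.h>

// Element shuffles now work directly on FP8 NEON vectors.
mfloat8x8_t reverse_halves(mfloat8x8_t x) {
  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
}

// So do same-size bitcasts via the C-style cast operator.
int8x8_t as_int8(mfloat8x8_t x) {
  return (int8x8_t)x;
}
```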
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 3457d524c63aa..1d9743520654e 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFloat32Type() const;
   bool isDoubleType() const;
   bool isBFloat16Type() const;
+  bool isMFloat8Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
   bool isRealType() const; // C99 6.2.5p17 (real floating + integer)
@@ -8537,6 +8538,10 @@ inline bool Type::isBFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::BFloat16);
 }

+inline bool Type::isMFloat8Type() const {
+  return isSpecificBuiltinType(BuiltinType::MFloat8);
+}
+
 inline bool Type::isFloat128Type() const {
   return isSpecificBuiltinType(BuiltinType::Float128);
 }
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index 063cac1f4a58e..2dd2754e778d6 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -201,8 +201,6 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T
 SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)

 AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1)
-AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x8_t", "__MFloat8x8_t", MFloat8x8, MFloat8x8Ty, 8, 8, 1)
-AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloat8x16Ty, 16, 8, 1)

 #undef SVE_VECTOR_TYPE
 #undef SVE_VECTOR_TYPE_BFLOAT
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 4dc8b24ed8ae6..83ef015018f1a 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -208,7 +208,8 @@ namespace clang {
       Float16,
       Float32,
       Float64,
-      BFloat16
+      BFloat16,
+      MFloat8
     };

     NeonTypeFlags(unsigned F) : Flags(F) {}
@@ -230,6 +231,7 @@ namespace clang {
       switch (getEltType()) {
       case Int8:
       case Poly8:
+      case MFloat8:
         return 8;
       case Int16:
       case Float16:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 1dd936cf4fb51..9948963d7f44b 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3919,6 +3919,9 @@ void CXXNameMangler::mangleNeonVectorType(const VectorType *T) {
   case BuiltinType::Float:     EltName = "float32_t"; break;
   case BuiltinType::Half:      EltName = "float16_t"; break;
   case BuiltinType::BFloat16:  EltName = "bfloat16_t"; break;
+  case BuiltinType::MFloat8:
+    EltName = "mfloat8_t";
+    break;
   default:
     llvm_unreachable("unexpected Neon vector element type");
   }
@@ -3972,6 +3975,8 @@ static StringRef mangleAArch64VectorBase(const BuiltinType *EltType) {
     return "Float64";
   case BuiltinType::BFloat16:
     return "Bfloat16";
+  case BuiltinType::MFloat8:
+    return "Mfloat8";
   default:
     llvm_unreachable("Unexpected vector element base type");
   }
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 26bccccdc5e36..5162ac503b8eb 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6679,6 +6679,7 @@ static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF,
   switch (TypeFlags.getEltType()) {
   case NeonTypeFlags::Int8:
   case NeonTypeFlags::Poly8:
+  case NeonTypeFlags::MFloat8:
     return llvm::FixedVectorType::get(CGF->Int8Ty, V1Ty ? 1 : (8 << IsQuad));
   case NeonTypeFlags::Int16:
   case NeonTypeFlags::Poly16:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 054f8d1eadb8c..e9f5497aaae24 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2414,8 +2414,15 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, LValue Dst,
         Vec = Builder.CreateBitCast(Vec, IRVecTy);
         // iN --> <N x i1>.
       }
-      Vec = Builder.CreateInsertElement(Vec, Src.getScalarVal(),
-                                        Dst.getVectorIdx(), "vecins");
+      llvm::Value *SrcVal = Src.getScalarVal();
+      // Allow inserting `<1 x T>` into an `<N x T>`. It can happen with scalar
+      // types which are mapped to vector LLVM IR types (e.g. for implementing
+      // an ABI).
+      if (auto *EltTy = dyn_cast<llvm::FixedVectorType>(SrcVal->getType());
+          EltTy && EltTy->getNumElements() == 1)
+        SrcVal = Builder.CreateBitCast(SrcVal, EltTy->getElementType());
+      Vec = Builder.CreateInsertElement(Vec, SrcVal, Dst.getVectorIdx(),
+                                        "vecins");
       if (IRStoreTy) {
         // <N x i1> --> iN.
         Vec = Builder.CreateBitCast(Vec, IRStoreTy);
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 09191a4901f49..950b23f4e13b9 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -650,7 +650,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     // An ext_vector_type of Bool is really a vector of bits.
     llvm::Type *IRElemTy = VT->isExtVectorBoolType()
                                ? llvm::Type::getInt1Ty(getLLVMContext())
-                               : ConvertType(VT->getElementType());
+                               : (VT->getElementType()->isMFloat8Type()
+                                      ? llvm::Type::getInt8Ty(getLLVMContext())
+                                      : ConvertType(VT->getElementType()));
     ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements());
     break;
   }
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 7db67ecba07c8..c702e79ff8eb9 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -383,10 +383,6 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
       NSRN = std::min(NSRN + 1, 8u);
     else {
       switch (BT->getKind()) {
-      case BuiltinType::MFloat8x8:
-      case BuiltinType::MFloat8x16:
-        NSRN = std::min(NSRN + 1, 8u);
-        break;
       case BuiltinType::SveBool:
       case BuiltinType::SveCount:
         NPRN = std::min(NPRN + 1, 4u);
@@ -629,8 +625,7 @@ bool AArch64ABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
   // but with the difference that any floating-point type is allowed,
   // including __fp16.
   if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
-    if (BT->isFloatingPoint() || BT->getKind() == BuiltinType::MFloat8x16 ||
-        BT->getKind() == BuiltinType::MFloat8x8)
+    if (BT->isFloatingPoint())
       return true;
   } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
     if (auto Kind = VT->getVectorKind();
diff --git a/clang/lib/Sema/SemaARM.cpp b/clang/lib/Sema/SemaARM.cpp
index db418d80e0e09..2620bbc97ba02 100644
--- a/clang/lib/Sema/SemaARM.cpp
+++ b/clang/lib/Sema/SemaARM.cpp
@@ -352,6 +352,8 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context,
     return Context.DoubleTy;
   case NeonTypeFlags::BFloat16:
     return Context.BFloat16Ty;
+  case NeonTypeFlags::MFloat8:
+    return Context.MFloat8Ty;
   }
   llvm_unreachable("Invalid NeonTypeFlag!");
 }
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index d5273d463d7c0..176627c3df37c 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -7503,7 +7503,7 @@ static bool breakDownVectorType(QualType type, uint64_t &len,
   if (const VectorType *vecType = type->getAs<VectorType>()) {
     len = vecType->getNumElements();
     eltType = vecType->getElementType();
-    assert(eltType->isScalarType());
+    assert(eltType->isScalarType() || eltType->isMFloat8Type());
     return true;
   }

@@ -10174,6 +10174,11 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
     return HLSL().handleVectorBinOpConversion(LHS, RHS, LHSType, RHSType,
                                               IsCompAssign);

+  // Any operation with MFloat8 type is only possible with C intrinsics
+  if ((LHSVecType && LHSVecType->getElementType()->isMFloat8Type()) ||
+      (RHSVecType && RHSVecType->getElementType()->isMFloat8Type()))
+    return InvalidOperands(Loc, LHS, RHS);
+
   // AltiVec-style "vector bool op vector bool" combinations are allowed
   // for some operators but not others.
   if (!AllowBothBool && LHSVecType &&
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index 2ccf5a8e1d6f3..33d5378944ddb 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -8306,7 +8306,8 @@ static bool isPermittedNeonBaseType(QualType &Ty, VectorKind VecKind, Sema &S) {
          BTy->getKind() == BuiltinType::ULongLong ||
          BTy->getKind() == BuiltinType::Float ||
          BTy->getKind() == BuiltinType::Half ||
-         BTy->getKind() == BuiltinType::BFloat16;
+         BTy->getKind() == BuiltinType::BFloat16 ||
+         BTy->getKind() == BuiltinType::MFloat8;
 }

 static bool verifyValidIntegerConstantExpr(Sema &S, const ParsedAttr &Attr,
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
new file mode 100644
index 0000000000000..147ca1d1becc1
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -0,0 +1,123 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-linux -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __attribute__((neon_vector_type(8))) __mfp8 mfloat8x8_t;
+typedef __attribute__((neon_vector_type(16))) __mfp8 mfloat8x16_t;
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
+//
+mfloat8x8_t test_8x8(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8_v(
+// CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <8 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <8 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <8 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <8 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <8 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <8 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <8 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <8 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <8 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <8 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <8 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <8 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <8 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <8 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <8 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
+//
+mfloat8x8_t test_8x8_v(mfloat8x8_t x, int8x8_t p) {
+  return __builtin_shufflevector(x, p);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
+//
+mfloat8x16_t test_8x16(mfloat8x16_t x) {
+  return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
+                                 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16_v(
+// CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <16 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <16 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <16 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <16 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <16 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <16 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <16 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <16 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <16 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <16 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <16 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <16 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <16 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <16 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <16 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <16 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    [[SHUF_IDX22:%.*]] = extractelement <16 x i8> [[MASK]], i64 8
+// CHECK-NEXT:    [[SHUF_ELT23:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX22]]
+// CHECK-NEXT:    [[SHUF_INS24:%.*]] = insertelement <16 x i8> [[SHUF_INS21]], i8 [[SHUF_ELT23]], i64 8
+// CHECK-NEXT:    [[SHUF_IDX25:%.*]] = extractelement <16 x i8> [[MASK]], i64 9
+// CHECK-NEXT:    [[SHUF_ELT26:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX25]]
+// CHECK-NEXT:    [[SHUF_INS27:%.*]] = insertelement <16 x i8> [[SHUF_INS24]], i8 [[SHUF_ELT26]], i64 9
+// CHECK-NEXT:    [[SHUF_IDX28:%.*]] = extractelement <16 x i8> [[MASK]], i64 10
+// CHECK-NEXT:    [[SHUF_ELT29:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX28]]
+// CHECK-NEXT:    [[SHUF_INS30:%.*]] = insertelement <16 x i8> [[SHUF_INS27]], i8 [[SHUF_ELT29]], i64 10
+// CHECK-NEXT:    [[SHUF_IDX31:%.*]] = extractelement <16 x i8> [[MASK]], i64 11
+// CHECK-NEXT:    [[SHUF_ELT32:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX31]]
+// CHECK-NEXT:    [[SHUF_INS33:%.*]] = insertelement <16 x i8> [[SHUF_INS30]], i8 [[SHUF_ELT32]], i64 11
+// CHECK-NEXT:    [[SHUF_IDX34:%.*]] = extractelement <16 x i8> [[MASK]], i64 12
+// CHECK-NEXT:    [[SHUF_ELT35:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX34]]
+// CHECK-NEXT:    [[SHUF_INS36:%.*]] = insertelement <16 x i8> [[SHUF_INS33]], i8 [[SHUF_ELT35]], i64 12
+// CHECK-NEXT:    [[SHUF_IDX37:%.*]] = extractelement <16 x i8> [[MASK]], i64 13
+// CHECK-NEXT:    [[SHUF_ELT38:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX37]]
+// CHECK-NEXT:    [[SHUF_INS39:%.*]] = insertelement <16 x i8> [[SHUF_INS36]], i8 [[SHUF_ELT38]], i64 13
+// CHECK-NEXT:    [[SHUF_IDX40:%.*]] = extractelement <16 x i8> [[MASK]], i64 14
+// CHECK-NEXT:    [[SHUF_ELT41:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX40]]
+// CHECK-NEXT:    [[SHUF_INS42:%.*]] = insertelement <16 x i8> [[SHUF_INS39]], i8 [[SHUF_ELT41]], i64 14
+// CHECK-NEXT:    [[SHUF_IDX43:%.*]] = extractelement <16 x i8> [[MASK]], i64 15
+// CHECK-NEXT:    [[SHUF_ELT44:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX43]]
+// CHECK-NEXT:    [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15
+// CHECK-NEXT:    ret <16 x i8> [[SHUF_INS45]]
+//
+mfloat8x16_t test_8x16_v(mfloat8x16_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-cast.c b/clang/test/CodeGen/AArch64/fp8-cast.c
new file mode 100644
index 0000000000000..a9ce31b9e6bea
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-cast.c
@@ -0,0 +1,193 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// Bitcast between FP8 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_f813__Mfloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+mfloat8x8_t test_f8_f8(mfloat8x8_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +mfloat8x16_t testq_f8_f8(mfloat8x16_t x) { + return (mfloat8x16_t) x; +} + +// Bitcast between FP8 and int8 Neon vectors +// CHECK-LABEL: define dso_local <8 x i8> @test_f8_s8( +// CHECK-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_s810__Int8x8_t( +// CHECK-CXX-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <8 x i8> [[X]] +// +mfloat8x8_t test_f8_s8(int8x8_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_s8_f8( +// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x i8> @_Z10test_s8_f813__Mfloat8x8_t( +// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <8 x i8> [[X]] +// +int8x8_t test_s8_f8(mfloat8x8_t x) { + return (int8x8_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_s8( +// CHECK-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_s811__Int8x16_t( +// CHECK-CXX-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +mfloat8x16_t testq_f8_s8(int8x16_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_s8_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local noundef <16 x i8> @_Z11testq_s8_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +int8x16_t testq_s8_f8(mfloat8x16_t x) { + return (int8x16_t) x; +} + +// Bitcast between FP8 and float32 Neon vectors +// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f32( +// CHECK-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z11test_f8_f3213__Float32x2_t( +// CHECK-CXX-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8> +// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// +mfloat8x8_t test_f8_f32(float32x2_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <2 x float> @test_f32_f8( +// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z11test_f32_f813__Mfloat8x8_t( +// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: 
[[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float> +// CHECK-CXX-NEXT: ret <2 x float> [[TMP0]] +// +float32x2_t test_f32_f8(mfloat8x8_t x) { + return (float32x2_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f32( +// CHECK-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z12testq_f8_f3213__Float32x4_t( +// CHECK-CXX-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// +mfloat8x16_t testq_f8_f32(float32x4_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local <4 x float> @testq_f32_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z12testq_f32_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float> +// CHECK-CXX-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t testq_f32_f8(mfloat8x16_t x) { + return (float32x4_t) x; +} + +// Bitcast between FP8 and poly128_t (which is integral) +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_p128( +// CHECK-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z13testq_f8_p128o( +// CHECK-CXX-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// +mfloat8x16_t testq_f8_p128(poly128_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local i128 @testq_p128_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128 +// CHECK-NEXT: ret i128 [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef i128 @_Z13testq_p128_f814__Mfloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128 +// CHECK-CXX-NEXT: ret i128 [[TMP0]] +// +poly128_t testq_p128_f8(mfloat8x16_t x) { + return (poly128_t) x; +} diff --git a/clang/test/CodeGen/arm-mfp8.c b/clang/test/CodeGen/arm-mfp8.c index bf91066335a25..9385b537f18b3 100644 --- a/clang/test/CodeGen/arm-mfp8.c +++ b/clang/test/CodeGen/arm-mfp8.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-C -// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -o - -x c++ %s | FileCheck %s --check-prefixes=CHECK,CHECK-CXX +// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -disable-O0-optnone -o - %s | opt -S --passes=mem2reg | FileCheck %s 
--check-prefixes=CHECK-C +// RUN: %clang_cc1 -emit-llvm -triple aarch64-arm-none-eabi -target-feature -fp8 -target-feature +neon -disable-O0-optnone -o - -x c++ %s | opt -S --passes=mem2reg | FileCheck %s --check-prefixes=CHECK-CXX // REQUIRES: aarch64-registered-target @@ -10,18 +10,12 @@ // CHECK-C-LABEL: define dso_local <16 x i8> @test_ret_mfloat8x16_t( // CHECK-C-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[V_ADDR:%.*]] = alloca <16 x i8>, align 16 -// CHECK-C-NEXT: store <16 x i8> [[V]], ptr [[V_ADDR]], align 16 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16 -// CHECK-C-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-C-NEXT: ret <16 x i8> [[V]] // -// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z21test_ret_mfloat8x16_tu14__MFloat8x16_t( +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z21test_ret_mfloat8x16_t14__Mfloat8x16_t( // CHECK-CXX-SAME: <16 x i8> [[V:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[V_ADDR:%.*]] = alloca <16 x i8>, align 16 -// CHECK-CXX-NEXT: store <16 x i8> [[V]], ptr [[V_ADDR]], align 16 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[V_ADDR]], align 16 -// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// CHECK-CXX-NEXT: ret <16 x i8> [[V]] // mfloat8x16_t test_ret_mfloat8x16_t(mfloat8x16_t v) { return v; @@ -30,18 +24,12 @@ mfloat8x16_t test_ret_mfloat8x16_t(mfloat8x16_t v) { // CHECK-C-LABEL: define dso_local <8 x i8> @test_ret_mfloat8x8_t( // CHECK-C-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[V_ADDR:%.*]] = alloca <8 x i8>, align 8 -// CHECK-C-NEXT: store <8 x i8> [[V]], ptr [[V_ADDR]], align 8 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8 -// CHECK-C-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-C-NEXT: ret <8 x i8> [[V]] // -// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z20test_ret_mfloat8x8_tu13__MFloat8x8_t( +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z20test_ret_mfloat8x8_t13__Mfloat8x8_t( // CHECK-CXX-SAME: <8 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[V_ADDR:%.*]] = alloca <8 x i8>, align 8 -// CHECK-CXX-NEXT: store <8 x i8> [[V]], ptr [[V_ADDR]], align 8 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[V_ADDR]], align 8 -// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// CHECK-CXX-NEXT: ret <8 x i8> [[V]] // mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) { return v; @@ -50,28 +38,22 @@ mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) { // CHECK-C-LABEL: define dso_local <1 x i8> @func1n( // CHECK-C-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[MFP8_ADDR:%.*]] = alloca <1 x i8>, align 1 // CHECK-C-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 -// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[MFP8_ADDR]], align 1 // CHECK-C-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: store <1 x i8> [[TMP0]], ptr [[ARRAYIDX]], align 1 +// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 // CHECK-C-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 -// CHECK-C-NEXT: ret <1 x i8> [[TMP1]] +// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-C-NEXT: ret <1 x i8> [[TMP0]] // // CHECK-CXX-LABEL: 
define dso_local <1 x i8> @_Z6func1nu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[MFP8_ADDR:%.*]] = alloca <1 x i8>, align 1 // CHECK-CXX-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 -// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[MFP8_ADDR]], align 1 // CHECK-CXX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: store <1 x i8> [[TMP0]], ptr [[ARRAYIDX]], align 1 +// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 // CHECK-CXX-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: [[TMP1:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 -// CHECK-CXX-NEXT: ret <1 x i8> [[TMP1]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-CXX-NEXT: ret <1 x i8> [[TMP0]] // __mfp8 func1n(__mfp8 mfp8) { __mfp8 f1n[10]; @@ -79,7 +61,43 @@ __mfp8 func1n(__mfp8 mfp8) { return f1n[2]; } +// CHECK-C-LABEL: define dso_local <1 x i8> @test_extract_element( +// CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 +// CHECK-C-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 [[I]] +// CHECK-C-NEXT: store i8 [[VECEXT]], ptr [[RETVAL]], align 1 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 +// CHECK-C-NEXT: ret <1 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <1 x i8> @_Z20test_extract_element14__Mfloat8x16_ti( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 +// CHECK-CXX-NEXT: [[VECEXT:%.*]] = extractelement <16 x i8> [[X]], i32 [[I]] +// CHECK-CXX-NEXT: store i8 [[VECEXT]], ptr [[RETVAL]], align 1 +// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 +// CHECK-CXX-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_extract_element(mfloat8x16_t x, int i) { + return x[i]; +} - -//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -// CHECK: {{.*}} +// CHECK-C-LABEL: define dso_local <16 x i8> @test_insert_element( +// CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8 +// CHECK-C-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] +// CHECK-C-NEXT: ret <16 x i8> [[VECINS]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z19test_insert_element14__Mfloat8x16_tiu6__mfp8( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8 +// CHECK-CXX-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] +// CHECK-CXX-NEXT: ret <16 x i8> [[VECINS]] +// +mfloat8x16_t test_insert_element(mfloat8x16_t x, int i, mfloat8_t v) { + x[i] = v; + return x; +} diff --git a/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp b/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp index 3b4a309327fe6..9b855698f57fd 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-neon-vectors.cpp @@ -11,6 +11,7 @@ typedef unsigned short poly16_t; typedef __fp16 float16_t; typedef float float32_t; typedef double float64_t; +typedef __mfp8 mfloat8_t; typedef __attribute__((neon_vector_type(8))) int8_t int8x8_t; typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t; @@ -26,6 +27,8 @@ typedef __attribute__((neon_vector_type(8))) uint16_t uint16x8_t; typedef __attribute__((neon_vector_type(2))) unsigned int uint32x2_t; typedef __attribute__((neon_vector_type(4))) unsigned int uint32x4_t; typedef __attribute__((neon_vector_type(2))) uint64_t uint64x2_t; +typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t; +typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t; typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t; typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t; typedef __attribute__((neon_vector_type(2))) float32_t float32x2_t; @@ -82,3 +85,7 @@ void f21(int64x2_t) {} void f22(uint64x2_t) {} // CHECK: 13__Float64x2_t void f23(float64x2_t) {} +// CHECK: 13__Mfloat8x8_t +void f24(mfloat8x8_t) {} +// CHECK: 14__Mfloat8x16_t +void f25(mfloat8x16_t) {} diff --git a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp index cb5e40be6a6df..2139a8ae98caf 100644 --- a/clang/test/CodeGenCXX/mangle-neon-vectors.cpp +++ b/clang/test/CodeGenCXX/mangle-neon-vectors.cpp @@ -9,6 +9,7 @@ typedef __fp16 float16_t; #if defined(__aarch64__) typedef unsigned char poly8_t; typedef unsigned short poly16_t; +typedef __mfp8 mfloat8_t; #else typedef signed char poly8_t; typedef short poly16_t; @@ -29,6 +30,8 @@ typedef __attribute__((neon_vector_type(4))) float16_t float16x4_t; typedef __attribute__((neon_vector_type(8))) float16_t float16x8_t; #ifdef __aarch64__ typedef __attribute__((neon_vector_type(2))) float64_t float64x2_t; +typedef __attribute__((neon_vector_type(8))) mfloat8_t mfloat8x8_t; +typedef __attribute__((neon_vector_type(16))) mfloat8_t mfloat8x16_t; #endif typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t; typedef __attribute__((neon_polyvector_type(8))) poly16_t poly16x8_t; @@ -86,3 +89,11 @@ void f11(float64x2_t v) { } // CHECK-AARCH64-BF16: 14__Bfloat16x4_t void f12(bfloat16x4_t v) {} #endif + + +#ifdef __aarch64__ +// 
CHECK-AARCH64: 13__Mfloat8x8_t
+void f13(mfloat8x8_t v) { }
+// CHECK-AARCH64: 14__Mfloat8x16_t
+void f14(mfloat8x16_t v) { }
+#endif
diff --git a/clang/test/Sema/aarch64-fp8-cast.c b/clang/test/Sema/aarch64-fp8-cast.c
new file mode 100644
index 0000000000000..ad25401919b5a
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-cast.c
@@ -0,0 +1,104 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// Bitcast between FP8 Neon vectors
+mfloat8x8_t err_test_f8_f8(mfloat8x16_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f8(mfloat8x8_t x) {
+  return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) of different size}}
+}
+
+// Bitcast between FP8 and int8 Neon vectors
+mfloat8x8_t err_test_f8_s8(int8x16_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'int8x16_t' (vector of 16 'int8_t' values) of different size}}
+}
+
+int8x8_t err_test_s8_f8(mfloat8x16_t x) {
+  return (int8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'int8x8_t' (vector of 8 'int8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_s8(int8x8_t x) {
+  return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'int8x8_t' (vector of 8 'int8_t' values) of different size}}
+}
+
+int8x16_t err_testq_s8_f8(mfloat8x8_t x) {
+  return (int8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'int8x16_t' (vector of 16 'int8_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) of different size}}
+}
+
+// Bitcast between FP8 and float32 Neon vectors
+mfloat8x8_t err_test_f8_f32(float32x4_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'float32x4_t' (vector of 4 'float32_t' values) of different size}}
+}
+
+float32x2_t err_test_f32_f8(mfloat8x16_t x) {
+  return (float32x2_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'float32x2_t' (vector of 2 'float32_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f32(float32x2_t x) {
+  return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'float32x2_t' (vector of 2 'float32_t' values) of different size}}
+}
+
+float32x4_t err_testq_f32_f8(mfloat8x8_t x) {
+  return (float32x4_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'float32x4_t' (vector of 4 'float32_t' values) and 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) of different size}}
+}
+
+// Bitcast between FP8 and poly128_t (which is integral)
+mfloat8x8_t err_testq_f8_p128(poly128_t x) {
+  return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and integer type 'poly128_t' (aka 'unsigned __int128') of different size}}
+}
+
+poly128_t err_testq_p128_f8(mfloat8x8_t x) {
+  return (poly128_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and integer type 'poly128_t' (aka 'unsigned __int128') of different size}}
+}
+
+// Bitcast between FP8 and a non-integral type
+mfloat8x8_t err_test_f8_ptr(void *p) {
+  return (mfloat8x8_t) p;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'void *'}}
+}
+
+void *err_test_ptr_f8(mfloat8x8_t v) {
+  return (void *) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'void *'}}
+}
+
+mfloat8x8_t err_test_f8_dbl(double v) {
+  return (mfloat8x8_t) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'double'}}
+}
+
+double err_test_dbl_f8(mfloat8x8_t v) {
+  return (double) v;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (vector of 8 'mfloat8_t' values) and scalar type 'double'}}
+}
+
+struct S {
+  char ch[16];
+};
+
+mfloat8x16_t err_test_f8_agg(struct S s) {
+  return (mfloat8x16_t) s;
+// expected-error@-1 {{operand of type 'struct S' where arithmetic or pointer type is required}}
+}
+
+struct S err_test_agg_f8(mfloat8x16_t v) {
+  return (struct S) v;
+// expected-error@-1 {{used type 'struct S' where arithmetic or pointer type is required}}
+}
diff --git a/clang/test/Sema/arm-mfp8.cpp b/clang/test/Sema/arm-mfp8.cpp
index be5bc9bb71dbd..1b4e6791420ec 100644
--- a/clang/test/Sema/arm-mfp8.cpp
+++ b/clang/test/Sema/arm-mfp8.cpp
@@ -48,17 +48,27 @@ void test_vector_sve(svmfloat8_t a, svuint8_t c) {
 #include <arm_neon.h>

 void test_vector(mfloat8x8_t a, mfloat8x16_t b, uint8x8_t c) {
-  a + b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a - b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a * b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
-  a / b;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}}
+  a + a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a - a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a * a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}
+  a / a;  // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x8_t')}}

-  a + c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' values))}}
-  a - c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' values))}}
-  a * c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' (vector of 8 'uint8_t' values))}}
-  a / c;  // neon-error {{cannot convert between vector and non-scalar values ('mfloat8x8_t' (aka '__MFloat8x8_t') and 'uint8x8_t' 
(vector of 8 'uint8_t' values))}} - c + b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} - c - b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} - c * b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} - c / b; // neon-error {{cannot convert between vector and non-scalar values ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (aka '__MFloat8x16_t'))}} + b + b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + b - b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + b * b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + b / b; // neon-error {{invalid operands to binary expression ('mfloat8x16_t' (vector of 16 'mfloat8_t' values) and 'mfloat8x16_t')}} + + a + b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + a - b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + a * b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + a / b; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + + a + c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + a - c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + a * c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + a / c; // neon-error {{invalid operands to binary expression ('mfloat8x8_t' (vector of 8 'mfloat8_t' values) and 'uint8x8_t' (vector of 8 'uint8_t' values))}} + c + b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + c - b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + c * b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} + c / b; // neon-error {{invalid operands to binary expression ('uint8x8_t' (vector of 8 'uint8_t' values) and 'mfloat8x16_t' (vector of 16 'mfloat8_t' values))}} } diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 49633bb7b7f58..7299a49252f0d 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -102,7 +102,7 @@ enum EltType { Float32, Float64, BFloat16, - MFloat8 // Not used by Sema or CodeGen in Clang + MFloat8 }; } 
// end namespace NeonTypeFlags @@ -2281,9 +2281,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { InIfdef = true; } - if (T.isMFloat8()) - OS << "typedef __MFloat8x"; - else if (T.isPoly()) + if (T.isPoly()) OS << "typedef __attribute__((neon_polyvector_type("; else OS << "typedef __attribute__((neon_vector_type("; @@ -2291,10 +2289,7 @@ static void emitNeonTypeDefs(const std::string& types, raw_ostream &OS) { Type T2 = T; T2.makeScalar(); OS << T.getNumElements(); - if (T.isMFloat8()) - OS << "_t "; - else - OS << "))) " << T2.str(); + OS << "))) " << T2.str(); OS << " " << T.str() << ";\n"; } if (InIfdef) From 8f17f51deb12456f25d32b9a42ac1f00feabbfbc Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Mon, 27 Jan 2025 18:50:53 +0800 Subject: [PATCH 172/432] [mlir][tosa] Fix comments format(NFC) (#124520) This PR corrects the formatting of comments in Markdown. The previous format was as follows: https://mlir.llvm.org/docs/Dialects/TOSA/#tosaerf-mlirtosaerfop ![image](https://github.com/user-attachments/assets/1d1d10d5-c960-4724-9fb4-29c17ea39b11) https://mlir.llvm.org/docs/Dialects/TOSA/#tosarescale-mlirtosarescaleop ![image](https://github.com/user-attachments/assets/fb23cbf6-be10-4a60-8b43-b28dc2db6918) --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 35 ++++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 92ab729f5b933..2186510e7db1e 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -454,7 +454,7 @@ def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf"> { let summary = "Computes gauss error function of input"; let description = [{ - Gauss error function: $ erf(x) = \frac{2}{\sqrt(\pi)} \int_{0}^{x} e^{-t^2} \,dt $ + Gauss error function: $ erf(x) = \frac{2}{\sqrt(\pi)} \int_{0}^{x} e^{-t^2} \ dt $ For quantized integer data types, the TABLE operator should be used instead with the following definition. The erf_table has 513 entries each of 16-bit/8-bit precision and covering the input range -4.0 to +4.0 in steps of 1/64. @@ -1886,23 +1886,22 @@ def Tosa_RescaleOp: Tosa_Op<"rescale", [Pure, let description = [{ Rescale quantized values into a new domain. 
Supported rescalings are: - | Mode | Input | Output | Unsigned | Unsigned | - | | | | input | output | - |------------------------|-------|--------|----------|----------| - | signed 8 to 8 | int8 | int8 | false | false | - | signed 8 to 16 | int8 | int16 | false | false | - | signed 8 to 32 | int8 | int32 | false | false | - | signed 16 to 8 | int16 | int8 | false | false | - | signed 16 to 16 | int16 | int16 | false | false | - | signed 16 to 32 | int16 | int32 | false | false | - | signed 32 to 8 | int32 | int8 | false | false | - | signed 32 to 16 | int32 | int16 | false | false | - | signed 32 to 32 | int32 | int32 | false | false | - | signed 48 to 8 | int48 | int8 | false | false | - | signed 48 to 16 | int48 | int16 | false | false | - | signed 48 to 32 | int48 | int32 | false | false | - | unsigned 8 to signed 8 | uint8 | int8 | true | false | - | signed 8 to unsigned 8 | int8 | uint8 | false | true | + | Mode | Input | Output | Unsigned input | Unsigned output | + |------------------------|-------|--------|----------------|-----------------| + | signed 8 to 8 | int8 | int8 | false | false | + | signed 8 to 16 | int8 | int16 | false | false | + | signed 8 to 32 | int8 | int32 | false | false | + | signed 16 to 8 | int16 | int8 | false | false | + | signed 16 to 16 | int16 | int16 | false | false | + | signed 16 to 32 | int16 | int32 | false | false | + | signed 32 to 8 | int32 | int8 | false | false | + | signed 32 to 16 | int32 | int16 | false | false | + | signed 32 to 32 | int32 | int32 | false | false | + | signed 48 to 8 | int48 | int8 | false | false | + | signed 48 to 16 | int48 | int16 | false | false | + | signed 48 to 32 | int48 | int32 | false | false | + | unsigned 8 to signed 8 | uint8 | int8 | true | false | + | signed 8 to unsigned 8 | int8 | uint8 | false | true | }]; let arguments = (ins From 14ffff384740f484b382a1225f4bd01aeebfdc3f Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Mon, 27 Jan 2025 11:54:06 +0100 Subject: [PATCH 173/432] [clang] Add dump() support for lvalue APValues (#124476) Add some lvalue information to the `dump()` output of lvalue APValues. 
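As a sketch of the new output (pointer values elided; based on the test added
below), for

```cpp
struct S { int i; int ii; };
S s;
constexpr int *p = &s.ii;  // points at the member of s at offset 4
```

the dump of the initializer's value now reads roughly
`value: LValue Base=VarDecl 0x..., Null=0, Offset=4, HasPath=1, PathLength=1, Path=(1)`
instead of the bare `value: LValue` printed before; path entries are shown as
array indices because no type information is available at that point.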
---
 clang/lib/AST/TextNodeDumper.cpp           | 30 ++++++++++++-
 clang/test/AST/ast-dump-APValue-lvalue.cpp | 50 ++++++++++++++++++++++
 clang/test/AST/ast-dump-APValue-todo.cpp   |  4 --
 3 files changed, 78 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/AST/ast-dump-APValue-lvalue.cpp

diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp
index 670641242cae2..46ec553fc05f0 100644
--- a/clang/lib/AST/TextNodeDumper.cpp
+++ b/clang/lib/AST/TextNodeDumper.cpp
@@ -710,10 +710,36 @@ void TextNodeDumper::Visit(const APValue &Value, QualType Ty) {
          << GetApproxValue(Value.getComplexFloatImag()) << 'i';
     }
     return;
-  case APValue::LValue:
+  case APValue::LValue: {
     (void)Context;
-    OS << "LValue ";
+    OS << "LValue Base=";
+    APValue::LValueBase B = Value.getLValueBase();
+    if (B.isNull())
+      OS << "null";
+    else if (const auto *BE = B.dyn_cast<const Expr *>()) {
+      OS << BE->getStmtClassName() << ' ';
+      dumpPointer(BE);
+    } else {
+      const auto *VDB = B.get<const ValueDecl *>();
+      OS << VDB->getDeclKindName() << "Decl";
+      dumpPointer(VDB);
+    }
+    OS << ", Null=" << Value.isNullPointer()
+       << ", Offset=" << Value.getLValueOffset().getQuantity()
+       << ", HasPath=" << Value.hasLValuePath();
+    if (Value.hasLValuePath()) {
+      OS << ", PathLength=" << Value.getLValuePath().size();
+      OS << ", Path=(";
+      llvm::ListSeparator Sep;
+      for (const auto &PathEntry : Value.getLValuePath()) {
+        // We're printing all entries as array indices because we don't have
+        // the type information here to do anything else.
+        OS << Sep << PathEntry.getAsArrayIndex();
+      }
+      OS << ")";
+    }
     return;
+  }
   case APValue::Array: {
     unsigned ArraySize = Value.getArraySize();
     unsigned NumInitializedElements = Value.getArrayInitializedElts();
diff --git a/clang/test/AST/ast-dump-APValue-lvalue.cpp b/clang/test/AST/ast-dump-APValue-lvalue.cpp
new file mode 100644
index 0000000000000..224caddb3eabe
--- /dev/null
+++ b/clang/test/AST/ast-dump-APValue-lvalue.cpp
@@ -0,0 +1,50 @@
+// Test without serialization:
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -std=gnu++17 \
+// RUN:            -ast-dump %s -ast-dump-filter Test \
+// RUN: | FileCheck --strict-whitespace --match-full-lines %s
+//
+// Test with serialization:
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -std=gnu++17 -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c++ -triple x86_64-unknown-unknown -Wno-unused-value -std=gnu++17 \
+// RUN: -include-pch %t -ast-dump-all -ast-dump-filter Test /dev/null \
+// RUN: | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" \
+// RUN: | FileCheck --strict-whitespace --match-full-lines %s
+
+int i;
+struct S {
+  int i;
+  int ii;
+};
+S s;
+
+struct F {
+  char padding[12];
+  S s;
+};
+F f;
+
+void Test(int (&arr)[10]) {
+  constexpr int *pi = &i;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pi 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=0, HasPath=1, PathLength=0, Path=()
+
+  constexpr int *psi = &s.i;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} psi 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=0, HasPath=1, PathLength=1, Path=({{.*}})
+
+  constexpr int *psii = &s.ii;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} psii 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=4, HasPath=1, PathLength=1, Path=({{.*}})
+
+  constexpr int *pf = &f.s.ii;
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pf 'int *const' constexpr cinit
+  // CHECK-NEXT:  |   |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=16, HasPath=1, PathLength=2, Path=({{.*}}, {{.*}})
+
+  constexpr char *pc = &f.padding[2];
+  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pc 'char *const' constexpr cinit
+  // CHECK-NEXT:  | |-value: LValue Base=VarDecl {{.*}}, Null=0, Offset=2, HasPath=1, PathLength=2, Path=({{.*}}, 2)
+
+  constexpr const int *n = nullptr;
+  // CHECK:  `-VarDecl {{.*}} col:{{.*}} n 'const int *const' constexpr cinit
+  // CHECK-NEXT:    |-value: LValue Base=null, Null=1, Offset=0, HasPath=1, PathLength=0, Path=()
+}
diff --git a/clang/test/AST/ast-dump-APValue-todo.cpp b/clang/test/AST/ast-dump-APValue-todo.cpp
index 78cc9cf36c73c..acaa82ba53b6f 100644
--- a/clang/test/AST/ast-dump-APValue-todo.cpp
+++ b/clang/test/AST/ast-dump-APValue-todo.cpp
@@ -16,10 +16,6 @@ struct S {
 };

 void Test() {
-  constexpr int *pi = &i;
-  // CHECK:  | `-VarDecl {{.*}} col:{{.*}} pi 'int *const' constexpr cinit
-  // CHECK-NEXT:  | |-value: LValue
-
   constexpr int(S::*pmi) = &S::i;
   // CHECK:  `-VarDecl {{.*}} col:{{.*}} pmi 'int (S::*const)' constexpr cinit
   // CHECK-NEXT:  |-value: MemberPointer

From 43a50deb63453cd3c800f097514d500536f9d436 Mon Sep 17 00:00:00 2001
From: Samuel Ginzburg
Date: Mon, 27 Jan 2025 05:58:26 -0500
Subject: [PATCH 174/432] [MLIR][ROCDL] Add GFX940 SMFMAC (2:4 sparsity)
 instructions to the ROCDL dialect (#124435)

# Overview

This PR adds 2:4 structured sparsity (sparse A, dense B) matrix multiply
instructions to ROCDL.

# Testing

I've added tests under mlir/test/Dialect and mlir/test/Target.
---
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 18 ++++
 mlir/test/Dialect/LLVMIR/rocdl.mlir          | 87 +++++++++++++++
 mlir/test/Target/LLVMIR/rocdl.mlir           | 89 ++++++++++++++++++++
 3 files changed, 194 insertions(+)

diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 95fbe7ed66a43..974712c581537 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -408,6 +408,24 @@ def ROCDL_mfma_i32_32x32x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x32.i8">;
 def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">;
 def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>;
 def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>;
+
+// 2:4 Sparsity ops (GFX940)
+def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">;
+def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">;
+def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">;
+def ROCDL_smfmac_f32_32x32x16_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.bf16">;
+def ROCDL_smfmac_i32_16x16x64_i8 : ROCDL_Mfma_IntrOp<"smfmac.i32.16x16x64.i8">;
+def ROCDL_smfmac_i32_32x32x32_i8 : ROCDL_Mfma_IntrOp<"smfmac.i32.32x32x32.i8">;
+def ROCDL_smfmac_f32_16x16x64_bf8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.bf8.bf8">;
+def ROCDL_smfmac_f32_16x16x64_bf8_fp8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.bf8.fp8">;
+def ROCDL_smfmac_f32_16x16x64_fp8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.fp8.bf8">;
+def ROCDL_smfmac_f32_16x16x64_fp8_fp8 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x64.fp8.fp8">;
+def ROCDL_smfmac_f32_32x32x32_bf8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.bf8.bf8">;
+def ROCDL_smfmac_f32_32x32x32_bf8_fp8 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.bf8.fp8">;
+def ROCDL_smfmac_f32_32x32x32_fp8_bf8 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.fp8.bf8">;
+def ROCDL_smfmac_f32_32x32x32_fp8_fp8 :
ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x32.fp8.fp8">;
+
+
 //===---------------------------------------------------------------------===//
 // WMMA intrinsics
 class ROCDL_Wmma_IntrOp<string mnemonic, list<int> overloadedOperands,
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 712f8c2a1caf6..5186e43398f01 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -258,6 +258,93 @@ func.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32,
   llvm.return
 }

+
+llvm.func @rocdl.smfmac(%arg0 : i32,
+                        %arg1 : vector<4 x f16>,
+                        %arg2 : vector<8 x f16>,
+                        %arg3 : vector<4 x f32>,
+                        %arg4 : vector<16 x f32>,
+                        %arg5 : vector<4 x i16>,
+                        %arg6 : vector<8 x i16>,
+                        %arg7 : vector<2xi32>,
+                        %arg8 : vector<4xi32>,
+                        %arg9 : vector<16xi32>) -> vector<4 x f32> {
+  %csti32 = llvm.mlir.constant(42 : i32) : i32
+
+  // CHECK-LABEL: rocdl.smfmac
+  // CHECK: rocdl.smfmac.f32.16x16x32.f16 %{{.*}} : (vector<4xf16>, vector<8xf16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r0 = rocdl.smfmac.f32.16x16x32.f16 %arg1, %arg2, %arg3, %csti32, %csti32, %csti32 :
+        (vector<4xf16>, vector<8xf16>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.32x32x16.f16 %{{.*}} : (vector<4xf16>, vector<8xf16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32>
+  %r1 = rocdl.smfmac.f32.32x32x16.f16 %arg1, %arg2, %arg4, %csti32, %csti32, %csti32 :
+        (vector<4xf16>, vector<8xf16>, vector<16xf32>,
+        i32, i32, i32) -> vector<16xf32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x32.bf16 %{{.*}} : (vector<4xi16>, vector<8xi16>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r2 = rocdl.smfmac.f32.16x16x32.bf16 %arg5, %arg6, %arg3, %csti32, %csti32, %csti32 :
+        (vector<4xi16>, vector<8xi16>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.32x32x16.bf16 %{{.*}} : (vector<4xi16>, vector<8xi16>, vector<16xf32>, i32, i32, i32) -> vector<16xf32>
+  %r3 = rocdl.smfmac.f32.32x32x16.bf16 %arg5, %arg6, %arg4, %csti32, %csti32, %csti32 :
+        (vector<4xi16>, vector<8xi16>, vector<16xf32>,
+        i32, i32, i32) -> vector<16xf32>
+
+  // CHECK: rocdl.smfmac.i32.16x16x64.i8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xi32>, i32, i32, i32) -> vector<4xi32>
+  %r4 = rocdl.smfmac.i32.16x16x64.i8 %arg7, %arg8, %arg8, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<4xi32>,
+        i32, i32, i32) -> vector<4xi32>
+
+  // CHECK: rocdl.smfmac.i32.32x32x32.i8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xi32>, i32, i32, i32) -> vector<16xi32>
+  %r5 = rocdl.smfmac.i32.32x32x32.i8 %arg7, %arg8, %arg9, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<16xi32>,
+        i32, i32, i32) -> vector<16xi32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x64.bf8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r6 = rocdl.smfmac.f32.16x16x64.bf8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x64.bf8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r7 = rocdl.smfmac.f32.16x16x64.bf8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 :
+        (vector<2xi32>, vector<4xi32>, vector<4xf32>,
+        i32, i32, i32) -> vector<4xf32>
+
+  // CHECK: rocdl.smfmac.f32.16x16x64.fp8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32>
+  %r8 = rocdl.smfmac.f32.16x16x64.fp8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 :
+        (vector<2xi32>,
vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: rocdl.smfmac.f32.16x16x64.fp8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<4xf32>, i32, i32, i32) -> vector<4xf32> + %r9 = rocdl.smfmac.f32.16x16x64.fp8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.bf8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r10 = rocdl.smfmac.f32.32x32x32.bf8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.bf8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r11 = rocdl.smfmac.f32.32x32x32.bf8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.fp8.bf8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r12 = rocdl.smfmac.f32.32x32x32.fp8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: rocdl.smfmac.f32.32x32x32.fp8.fp8 %{{.*}} : (vector<2xi32>, vector<4xi32>, vector<16xf32>, i32, i32, i32) -> vector<16xf32> + %r13 = rocdl.smfmac.f32.32x32x32.fp8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + llvm.return %r0 : vector<4 x f32> +} + llvm.func @rocdl.mfma.scale.f32.32x32x64.f8f6f4(%arg0 : i32, %arg1 : vector<16 x f32>, %arg2 : vector<8xi32>, %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) { diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index b74edb6210683..326bd3ae6b6f8 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -398,6 +398,95 @@ llvm.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32, llvm.return %r0 : vector<32 x f32> } +llvm.func @rocdl.smfmac(%arg0 : i32, + %arg1 : vector<4 x f16>, + %arg2 : vector<8 x f16>, + %arg3 : vector<4 x f32>, + %arg4 : vector<16 x f32>, + %arg5 : vector<4 x i16>, + %arg6 : vector<8 x i16>, + %arg7 : vector<2xi32>, + %arg8 : vector<4xi32>, + %arg9 : vector<16xi32>) -> vector<4 x f32> { + %csti32 = llvm.mlir.constant(42 : i32) : i32 + + // CHECK-LABEL: rocdl.smfmac + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %{{.*}}, <8 x half> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r0 = rocdl.smfmac.f32.16x16x32.f16 %arg1, %arg2, %arg3, %csti32, %csti32, %csti32 : + (vector<4xf16>, vector<8xf16>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %{{.*}}, <8 x half> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r1 = rocdl.smfmac.f32.32x32x16.f16 %arg1, %arg2, %arg4, %csti32, %csti32, %csti32 : + (vector<4xf16>, vector<8xf16>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r2 = rocdl.smfmac.f32.16x16x32.bf16 %arg5, %arg6, %arg3, %csti32, %csti32, %csti32 : + (vector<4xi16>, vector<8xi16>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <16 x float> 
@llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r3 = rocdl.smfmac.f32.32x32x16.bf16 %arg5, %arg6, %arg4, %csti32, %csti32, %csti32 : + (vector<4xi16>, vector<8xi16>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i32 42, i32 42, i32 42) + %r4 = rocdl.smfmac.i32.16x16x64.i8 %arg7, %arg8, %arg8, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xi32>, + i32, i32, i32) -> vector<4xi32> + + // CHECK: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x i32> %{{.*}}, i32 42, i32 42, i32 42) + %r5 = rocdl.smfmac.i32.32x32x32.i8 %arg7, %arg8, %arg9, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xi32>, + i32, i32, i32) -> vector<16xi32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r6 = rocdl.smfmac.f32.16x16x64.bf8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r7 = rocdl.smfmac.f32.16x16x64.bf8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r8 = rocdl.smfmac.f32.16x16x64.fp8.bf8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r9 = rocdl.smfmac.f32.16x16x64.fp8.fp8 %arg7, %arg8, %arg3, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<4xf32>, + i32, i32, i32) -> vector<4xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r10 = rocdl.smfmac.f32.32x32x32.bf8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r11 = rocdl.smfmac.f32.32x32x32.bf8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r12 = rocdl.smfmac.f32.32x32x32.fp8.bf8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) -> vector<16xf32> + + + // CHECK: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %{{.*}}, <4 x i32> %{{.*}}, <16 x float> %{{.*}}, i32 42, i32 42, i32 42) + %r13 = rocdl.smfmac.f32.32x32x32.fp8.fp8 %arg7, %arg8, %arg4, %csti32, %csti32, %csti32 : + (vector<2xi32>, vector<4xi32>, vector<16xf32>, + i32, i32, i32) 
-> vector<16xf32> + + llvm.return %r0 : vector<4 x f32> +} + + llvm.func @rocdl.mfma.scale.f32.32x32x64.f8f6f4(%arg0 : i32, %arg1 : vector<16 x f32>, %arg2 : vector<8xi32>, %arg3 : vector<6xi32>, %arg4 : vector<4xi32>) -> vector<16 x f32> { From ac87d6b03642eca3901a7776d73be368299402e9 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Mon, 27 Jan 2025 12:28:09 +0100 Subject: [PATCH 175/432] [mlir][arith] Fold `arith.cmpi eq, %val, %one : i1` -> `%val` and `arith.cmpi ne, %val, %zero : i1 -> %val` (#124436) https://alive2.llvm.org/ce/z/dNZMdC --- flang/test/Lower/Intrinsics/ieee_next.f90 | 10 ++-- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 12 ++++ mlir/test/Dialect/Arith/canonicalize.mlir | 72 +++++++++++++++++++++++ 3 files changed, 88 insertions(+), 6 deletions(-) diff --git a/flang/test/Lower/Intrinsics/ieee_next.f90 b/flang/test/Lower/Intrinsics/ieee_next.f90 index fa9692b83bc87..eb9cc028368a5 100644 --- a/flang/test/Lower/Intrinsics/ieee_next.f90 +++ b/flang/test/Lower/Intrinsics/ieee_next.f90 @@ -131,9 +131,8 @@ program p ! CHECK: %[[V_106:[0-9]+]] = arith.bitcast %[[V_104]] : f32 to i32 ! CHECK: %[[V_107:[0-9]+]] = arith.shrui %[[V_106]], %c31{{.*}} : i32 ! CHECK: %[[V_108:[0-9]+]] = fir.convert %[[V_107]] : (i32) -> i1 - ! CHECK: %[[V_109:[0-9]+]] = arith.cmpi ne, %[[V_108]], %false{{[_0-9]*}} : i1 ! CHECK: %[[V_110:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 516 : i32}> : (f32) -> i1 - ! CHECK: %[[V_111:[0-9]+]] = arith.andi %[[V_110]], %[[V_109]] : i1 + ! CHECK: %[[V_111:[0-9]+]] = arith.andi %[[V_110]], %[[V_108]] : i1 ! CHECK: %[[V_112:[0-9]+]] = arith.ori %[[V_105]], %[[V_111]] : i1 ! CHECK: %[[V_113:[0-9]+]] = fir.if %[[V_112]] -> (f32) { ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_104]]) <{bit = 1 : i32}> : (f32) -> i1 @@ -149,7 +148,7 @@ program p ! CHECK: } else { ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_106]], %c1{{.*}} : i32 ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_106]], %c1{{.*}} : i32 - ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_109]], %[[V_205]], %[[V_204]] : i32 + ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_108]], %[[V_205]], %[[V_204]] : i32 ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i32 to f32 ! CHECK: fir.result %[[V_207]] : f32 ! CHECK: } @@ -253,9 +252,8 @@ program p ! CHECK: %[[V_182:[0-9]+]] = arith.bitcast %[[V_180]] : f128 to i128 ! CHECK: %[[V_183:[0-9]+]] = arith.shrui %[[V_182]], %c127{{.*}} : i128 ! CHECK: %[[V_184:[0-9]+]] = fir.convert %[[V_183]] : (i128) -> i1 - ! CHECK: %[[V_185:[0-9]+]] = arith.cmpi ne, %[[V_184]], %false{{[_0-9]*}} : i1 ! CHECK: %[[V_186:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 516 : i32}> : (f128) -> i1 - ! CHECK: %[[V_187:[0-9]+]] = arith.andi %[[V_186]], %[[V_185]] : i1 + ! CHECK: %[[V_187:[0-9]+]] = arith.andi %[[V_186]], %[[V_184]] : i1 ! CHECK: %[[V_188:[0-9]+]] = arith.ori %[[V_181]], %[[V_187]] : i1 ! CHECK: %[[V_189:[0-9]+]] = fir.if %[[V_188]] -> (f128) { ! CHECK: %[[V_202:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_180]]) <{bit = 1 : i32}> : (f128) -> i1 @@ -271,7 +269,7 @@ program p ! CHECK: } else { ! CHECK-DAG: %[[V_204:[0-9]+]] = arith.subi %[[V_182]], %c1{{.*}} : i128 ! CHECK-DAG: %[[V_205:[0-9]+]] = arith.addi %[[V_182]], %c1{{.*}} : i128 - ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_185]], %[[V_205]], %[[V_204]] : i128 + ! CHECK: %[[V_206:[0-9]+]] = arith.select %[[V_184]], %[[V_205]], %[[V_204]] : i128 ! CHECK: %[[V_207:[0-9]+]] = arith.bitcast %[[V_206]] : i128 to f128 ! CHECK: fir.result %[[V_207]] : f128 ! 
CHECK: } diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index 7ca104691e6df..75d59ba8c1a10 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -1865,6 +1865,18 @@ OpFoldResult arith::CmpIOp::fold(FoldAdaptor adaptor) { getPredicate() == arith::CmpIPredicate::ne) return extOp.getOperand(); } + + // arith.cmpi ne, %val, %zero : i1 -> %val + if (getElementTypeOrSelf(getLhs().getType()).isInteger(1) && + getPredicate() == arith::CmpIPredicate::ne) + return getLhs(); + } + + if (matchPattern(adaptor.getRhs(), m_One())) { + // arith.cmpi eq, %val, %one : i1 -> %val + if (getElementTypeOrSelf(getLhs().getType()).isInteger(1) && + getPredicate() == arith::CmpIPredicate::eq) + return getLhs(); } // Move constant to the right side. diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 522711b08f289..3a16ee3d4f8fd 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -160,6 +160,78 @@ func.func @selNotCond(%arg0: i1, %arg1 : i32, %arg2 : i32, %arg3 : i32, %arg4 : return %res1, %res2 : i32, i32 } +// CHECK-LABEL: @cmpiI1eq +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1eq(%arg0: i1) -> i1 { + %one = arith.constant 1 : i1 + %res = arith.cmpi eq, %arg0, %one : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1eqVec +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1eqVec(%arg0: vector<4xi1>) -> vector<4xi1> { + %one = arith.constant dense<1> : vector<4xi1> + %res = arith.cmpi eq, %arg0, %one : vector<4xi1> + return %res : vector<4xi1> +} + +// CHECK-LABEL: @cmpiI1ne +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1ne(%arg0: i1) -> i1 { + %zero = arith.constant 0 : i1 + %res = arith.cmpi ne, %arg0, %zero : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1neVec +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1neVec(%arg0: vector<4xi1>) -> vector<4xi1> { + %zero = arith.constant dense<0> : vector<4xi1> + %res = arith.cmpi ne, %arg0, %zero : vector<4xi1> + return %res : vector<4xi1> +} + +// CHECK-LABEL: @cmpiI1eqLhs +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1eqLhs(%arg0: i1) -> i1 { + %one = arith.constant 1 : i1 + %res = arith.cmpi eq, %one, %arg0 : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1eqVecLhs +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1eqVecLhs(%arg0: vector<4xi1>) -> vector<4xi1> { + %one = arith.constant dense<1> : vector<4xi1> + %res = arith.cmpi eq, %one, %arg0 : vector<4xi1> + return %res : vector<4xi1> +} + +// CHECK-LABEL: @cmpiI1neLhs +// CHECK-SAME: (%[[ARG:.*]]: i1) +// CHECK: return %[[ARG]] +func.func @cmpiI1neLhs(%arg0: i1) -> i1 { + %zero = arith.constant 0 : i1 + %res = arith.cmpi ne, %zero, %arg0 : i1 + return %res : i1 +} + +// CHECK-LABEL: @cmpiI1neVecLhs +// CHECK-SAME: (%[[ARG:.*]]: vector<4xi1>) +// CHECK: return %[[ARG]] +func.func @cmpiI1neVecLhs(%arg0: vector<4xi1>) -> vector<4xi1> { + %zero = arith.constant dense<0> : vector<4xi1> + %res = arith.cmpi ne, %zero, %arg0 : vector<4xi1> + return %res : vector<4xi1> +} + // Test case: Folding of comparisons with equal operands. 
// CHECK-LABEL: @cmpi_equal_operands
// CHECK-DAG:   %[[T:.*]] = arith.constant true

From ddbfe6f7d2075a828fa9e8e5f5734bf881cda13a Mon Sep 17 00:00:00 2001
From: Robert Dazi <14996868+v01dXYZ@users.noreply.github.com>
Date: Mon, 27 Jan 2025 12:43:37 +0100
Subject: [PATCH 176/432] [Sema] Fix __array_rank instantiation (#124491)

The type being queried was left as a template type parameter, making the
whole expression dependent and thus not usable in static_assert.

Fixes #123498

Co-authored-by: v01dxyz
Co-authored-by: cor3ntin
---
 clang/docs/ReleaseNotes.rst                   |   2 +
 clang/include/clang/AST/ExprCXX.h             |   4 +-
 clang/lib/Sema/TreeTransform.h                |   3 -
 .../array-type-trait-with-template.cpp        | 129 ++++++++++++++++++
 4 files changed, 133 insertions(+), 5 deletions(-)
 create mode 100644 clang/test/SemaCXX/array-type-trait-with-template.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b63bd366cfe88..c60565a568234 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1002,6 +1002,8 @@ Bug Fixes to C++ Support
 - Fixed assertions or false compiler diagnostics in the case of C++ modules for
   lambda functions or inline friend functions defined inside templates (#GH122493).
 - Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423)
+- Fix type of expression when calling a template which returns an ``__array_rank`` querying a type depending on a
+  template parameter. Now, such expression can be used with ``static_assert`` and ``constexpr``. (#GH123498)
 - Correctly determine the implicit constexprness of lambdas in dependent contexts. (#GH97958) (#GH114234)

 Bug Fixes to AST Handling
diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h
index aa10945addf78..2a130bc6da79a 100644
--- a/clang/include/clang/AST/ExprCXX.h
+++ b/clang/include/clang/AST/ExprCXX.h
@@ -2847,8 +2847,8 @@ class TypeTraitExpr final
 ///
 /// Example:
 /// \code
-///   __array_rank(int[10][20]) == 2
-///   __array_extent(int, 1) == 20
+///   __array_rank(int[10][20]) == 2
+///   __array_extent(int[10][20], 1) == 20
 /// \endcode
 class ArrayTypeTraitExpr : public Expr {
   /// The trait. An ArrayTypeTrait enum in MSVC compat unsigned.
diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h
index 12680843a434a..f04adf7fdf8ad 100644
--- a/clang/lib/Sema/TreeTransform.h
+++ b/clang/lib/Sema/TreeTransform.h
@@ -14947,9 +14947,6 @@ TreeTransform<Derived>::TransformArrayTypeTraitExpr(ArrayTypeTraitExpr *E) {
     SubExpr = getDerived().TransformExpr(E->getDimensionExpression());
     if (SubExpr.isInvalid())
       return ExprError();
-
-    if (!getDerived().AlwaysRebuild() && SubExpr.get() == E->getDimensionExpression())
-      return E;
   }

   return getDerived().RebuildArrayTypeTrait(E->getTrait(), E->getBeginLoc(), T,
diff --git a/clang/test/SemaCXX/array-type-trait-with-template.cpp b/clang/test/SemaCXX/array-type-trait-with-template.cpp
new file mode 100644
index 0000000000000..942714ec5d55a
--- /dev/null
+++ b/clang/test/SemaCXX/array-type-trait-with-template.cpp
@@ -0,0 +1,129 @@
+// RUN: %clang_cc1 -fsyntax-only %s
+// RUN: %clang_cc1 -fsyntax-only -std=c++20 -DWITH_AUTO_FUNCTION_PARAMETER=1 %s
+
+// When __array_rank is used with a template type parameter, this test
+// ensures clang treats the final expression as one that can be used with
+// static_assert/constexpr.
+//
+// Although array_extent was handled well, we add it as a precaution.
+
+template <typename T>
+using remove_reference_t = __remove_reference_t(T);
+
+template <typename T, int N>
+constexpr int array_rank(T (&lhs)[N]) {
+  return __array_rank(T[N]);
+}
+
+template <int I, typename T, int N>
+constexpr int array_extent(T (&lhs)[N]) {
+  return __array_extent(T[N], I);
+}
+
+template <typename T>
+struct Rank {
+  using ArrayT = remove_reference_t<T>;
+
+  template <int N>
+  static constexpr int call(ArrayT (&lhs)[N]) {
+    return __array_rank(ArrayT[N]);
+  }
+};
+
+template <typename T, int I>
+struct Extent {
+  using ArrayT = remove_reference_t<T>;
+
+  template <int N>
+  static constexpr int call(ArrayT (&lhs)[N]) {
+    return __array_extent(ArrayT[N], I);
+  }
+};
+
+#ifdef WITH_AUTO_FUNCTION_PARAMETER
+template <int N>
+constexpr int array_rank_auto(auto (&lhs)[N]) {
+  return __array_rank(remove_reference_t<decltype(lhs[0])>[N]);
+}
+
+template <int I, int N>
+constexpr int array_extent_auto(auto (&lhs)[N]) {
+  return __array_extent(remove_reference_t<decltype(lhs[0])>[N], I);
+}
+#endif
+
+template <int N>
+constexpr int array_rank_int(const int (&lhs)[N]) {
+  return __array_rank(const int[N]);
+}
+
+template <int I, int N>
+constexpr int array_extent_int(const int (&lhs)[N]) {
+  return __array_extent(const int[N], I);
+}
+
+template <int M, int N>
+constexpr int array_rank_int(const int (&lhs)[M][N]) {
+  return __array_rank(const int[M][N]);
+}
+
+template <int I, int M, int N>
+constexpr int array_extent_int(const int (&lhs)[M][N]) {
+  return __array_extent(const int[M][N], I);
+}
+
+int main() {
+  constexpr int vec[] = {0, 1, 2, 1};
+  constexpr int mat[4][4] = {
+    {1, 0, 0, 0},
+    {0, 1, 0, 0},
+    {0, 0, 1, 0},
+    {0, 0, 0, 1}
+  };
+
+#define ATT_TESTS_WITH_ASSERT(ATT_ASSERT)  \
+  { ATT_ASSERT(RANK(vec) == 1); }          \
+  { ATT_ASSERT(RANK(mat) == 2); }          \
+  { ATT_ASSERT(EXTENT(vec, 0) == 4); }     \
+  { ATT_ASSERT(EXTENT(vec, 1) == 0); }     \
+  { ATT_ASSERT(EXTENT(mat, 1) == 4); }
+
+#define ATT_TESTS()                                \
+  ATT_TESTS_WITH_ASSERT( constexpr bool cst = )    \
+  ATT_TESTS_WITH_ASSERT( (void) )                  \
+  ATT_TESTS_WITH_ASSERT( static_assert )
+
+  {
+#define RANK(lhs) array_rank(lhs)
+#define EXTENT(lhs, i) array_extent<i>(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+
+  {
+#define RANK(lhs) Rank<decltype(lhs[0])>::call(lhs)
+#define EXTENT(lhs, i) Extent<decltype(lhs[0]), i>::call(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+
+#ifdef WITH_AUTO_FUNCTION_PARAMETER
+  {
+#define RANK(lhs) array_rank_auto(lhs)
+#define EXTENT(lhs, i) array_extent_auto<i>(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+#endif
+
+  {
+#define RANK(lhs) array_rank_int(lhs)
+#define EXTENT(lhs, i) array_extent_int<i>(lhs)
+    ATT_TESTS();
+#undef RANK
+#undef EXTENT
+  }
+}

From b7286dbef9dc1986860d29e390b092599e1d7db5 Mon Sep 17 00:00:00 2001
From: David Sherwood
Date: Mon, 27 Jan 2025 11:59:38 +0000
Subject: [PATCH 177/432] Reland "[LoopVectorize] Add support for reverse
 loops in isDereferenceableAndAlignedInLoop #96752" (#123616)

The last attempt failed a sanitiser build because we were creating a
reference to a null Predicates pointer in
isDereferenceableAndAlignedInLoop. This was exposed by the unit test
IsDerefReadOnlyLoop in unittests/Analysis/LoadsTest.cpp. I fixed this
by falling back on getConstantMaxBackedgeTakenCount if Predicates is
null - see line 316 in llvm/lib/Analysis/Loads.cpp. There are no other
changes.
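For a rough picture of what this enables, consider a sketch like the following (illustrative only: the function `sum_reverse`, the hypothetical `init` helper, and the bound 1024 are made up here; the updated load-deref-pred tests below are the authoritative examples):

```cpp
// Illustrative sketch: a conditionally-executed load with a negative stride.
extern void init(int (&buf)[1024]);  // hypothetical initializer

int sum_reverse(const int *cond) {
  int local[1024];  // locally allocated, so dereferenceable over its extent
  init(local);
  int sum = 0;
  // Reverse loop: with this change the accessed range [start, end] is
  // computed from the max backedge-taken count even for a negative step,
  // so the load of local[i] can be speculated and the loop vectorized
  // without predicated (masked) loads.
  for (int i = 1023; i >= 0; --i)
    if (cond[i] != 3)
      sum += local[i];
  return sum;
}
```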
---
 .../llvm/Analysis/LoopAccessAnalysis.h        |  19 ++
 llvm/lib/Analysis/Loads.cpp                   | 113 +++++----
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      |  61 ++---
 .../LoopVectorize/X86/load-deref-pred.ll      | 238 ++++++------------
 .../LoopVectorize/load-deref-pred-align.ll    |  27 +-
 5 files changed, 191 insertions(+), 267 deletions(-)

diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index 31374a128856c..6fc6ca14d0889 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -853,6 +853,25 @@ bool sortPtrAccesses(ArrayRef<Value *> VL, Type *ElemTy, const DataLayout &DL,
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                          ScalarEvolution &SE, bool CheckType = true);

+/// Calculate Start and End points of memory access.
+/// Let's assume A is the first access and B is a memory access on N-th loop
+/// iteration. Then B is calculated as:
+///   B = A + Step*N .
+/// Step value may be positive or negative.
+/// N is a calculated back-edge taken count:
+///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
+/// Start and End points are calculated in the following way:
+/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
+/// where SizeOfElt is the size of single memory access in bytes.
+///
+/// There is no conflict when the intervals are disjoint:
+/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
+std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
+    DenseMap<std::pair<const SCEV *, Type *>,
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds);
+
 class LoopAccessInfoManager {
   /// The cache.
   DenseMap<Loop *, std::unique_ptr<LoopAccessInfo>> LoopAccessInfoMap;
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 9279f19b72a3f..691d7e4a3edcf 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
@@ -277,84 +278,90 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
 bool llvm::isDereferenceableAndAlignedInLoop(
     LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT,
     AssumptionCache *AC, SmallVectorImpl<const SCEVPredicate *> *Predicates) {
+  const Align Alignment = LI->getAlign();
   auto &DL = LI->getDataLayout();
   Value *Ptr = LI->getPointerOperand();
   APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
                 DL.getTypeStoreSize(LI->getType()).getFixedValue());
-  const Align Alignment = LI->getAlign();
-
-  Instruction *HeaderFirstNonPHI = &*L->getHeader()->getFirstNonPHIIt();

   // If given a uniform (i.e. non-varying) address, see if we can prove the
   // access is safe within the loop w/o needing predication.
   if (L->isLoopInvariant(Ptr))
-    return isDereferenceableAndAlignedPointer(Ptr, Alignment, EltSize, DL,
-                                              HeaderFirstNonPHI, AC, &DT);
+    return isDereferenceableAndAlignedPointer(
+        Ptr, Alignment, EltSize, DL, &*L->getHeader()->getFirstNonPHIIt(), AC,
+        &DT);
+
+  const SCEV *PtrScev = SE.getSCEV(Ptr);
+  auto *AddRec = dyn_cast<SCEVAddRecExpr>(PtrScev);

-  // Otherwise, check to see if we have a repeating access pattern where we can
-  // prove that all accesses are well aligned and dereferenceable.
-  auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
+  // Check to see if we have a repeating access pattern and it's possible
+  // to prove all accesses are well aligned.
   if (!AddRec || AddRec->getLoop() != L || !AddRec->isAffine())
     return false;
-  auto* Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
+
+  auto *Step = dyn_cast<SCEVConstant>(AddRec->getStepRecurrence(SE));
   if (!Step)
     return false;

-  auto TC = SE.getSmallConstantMaxTripCount(L, Predicates);
-  if (!TC)
+  // For the moment, restrict ourselves to the case where the access size is a
+  // multiple of the requested alignment and the base is aligned.
+  // TODO: generalize if a case found which warrants
+  if (EltSize.urem(Alignment.value()) != 0)
     return false;

   // TODO: Handle overlapping accesses.
-  // We should be computing AccessSize as (TC - 1) * Step + EltSize.
-  if (EltSize.sgt(Step->getAPInt()))
+  if (EltSize.ugt(Step->getAPInt().abs()))
+    return false;
+
+  const SCEV *MaxBECount =
+      Predicates ? SE.getPredicatedConstantMaxBackedgeTakenCount(L, *Predicates)
+                 : SE.getConstantMaxBackedgeTakenCount(L);
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    return false;
+
+  const auto &[AccessStart, AccessEnd] = getStartAndEndForAccess(
+      L, PtrScev, LI->getType(), MaxBECount, &SE, nullptr);
+  if (isa<SCEVCouldNotCompute>(AccessStart) ||
+      isa<SCEVCouldNotCompute>(AccessEnd))
     return false;

-  // Compute the total access size for access patterns with unit stride and
-  // patterns with gaps. For patterns with unit stride, Step and EltSize are the
-  // same.
-  // For patterns with gaps (i.e. non unit stride), we are
-  // accessing EltSize bytes at every Step.
-  APInt AccessSize = TC * Step->getAPInt();
+  // Try to get the access size.
+  const SCEV *PtrDiff = SE.getMinusSCEV(AccessEnd, AccessStart);
+  APInt MaxPtrDiff = SE.getUnsignedRangeMax(PtrDiff);

-  assert(SE.isLoopInvariant(AddRec->getStart(), L) &&
-         "implied by addrec definition");
   Value *Base = nullptr;
-  if (auto *StartS = dyn_cast<SCEVUnknown>(AddRec->getStart())) {
-    Base = StartS->getValue();
-  } else if (auto *StartS = dyn_cast<SCEVAddExpr>(AddRec->getStart())) {
-    // Handle (NewBase + offset) as start value.
-    const auto *Offset = dyn_cast<SCEVConstant>(StartS->getOperand(0));
-    const auto *NewBase = dyn_cast<SCEVUnknown>(StartS->getOperand(1));
-    if (StartS->getNumOperands() == 2 && Offset && NewBase) {
-      // The following code below assumes the offset is unsigned, but GEP
-      // offsets are treated as signed so we can end up with a signed value
-      // here too. For example, suppose the initial PHI value is (i8 255),
-      // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
-      if (Offset->getAPInt().isNegative())
-        return false;
+  APInt AccessSize;
+  if (const SCEVUnknown *NewBase = dyn_cast<SCEVUnknown>(AccessStart)) {
+    Base = NewBase->getValue();
+    AccessSize = MaxPtrDiff;
+  } else if (auto *MinAdd = dyn_cast<SCEVAddExpr>(AccessStart)) {
+    if (MinAdd->getNumOperands() != 2)
+      return false;

-      // For the moment, restrict ourselves to the case where the offset is a
-      // multiple of the requested alignment and the base is aligned.
-      // TODO: generalize if a case found which warrants
-      if (Offset->getAPInt().urem(Alignment.value()) != 0)
-        return false;
-      Base = NewBase->getValue();
-      bool Overflow = false;
-      AccessSize = AccessSize.uadd_ov(Offset->getAPInt(), Overflow);
-      if (Overflow)
-        return false;
-    }
-  }
+    const auto *Offset = dyn_cast<SCEVConstant>(MinAdd->getOperand(0));
+    const auto *NewBase = dyn_cast<SCEVUnknown>(MinAdd->getOperand(1));
+    if (!Offset || !NewBase)
+      return false;

-  if (!Base)
-    return false;
+    // The following code below assumes the offset is unsigned, but GEP
+    // offsets are treated as signed so we can end up with a signed value
+    // here too. For example, suppose the initial PHI value is (i8 255),
+    // the offset will be treated as (i8 -1) and sign-extended to (i64 -1).
+    if (Offset->getAPInt().isNegative())
+      return false;

-  // For the moment, restrict ourselves to the case where the access size is a
-  // multiple of the requested alignment and the base is aligned.
-  // TODO: generalize if a case found which warrants
-  if (EltSize.urem(Alignment.value()) != 0)
+    // For the moment, restrict ourselves to the case where the offset is a
+    // multiple of the requested alignment and the base is aligned.
+    // TODO: generalize if a case found which warrants
+    if (Offset->getAPInt().urem(Alignment.value()) != 0)
+      return false;
+
+    AccessSize = MaxPtrDiff + Offset->getAPInt();
+    Base = NewBase->getValue();
+  } else
     return false;
+
+  Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
   return isDereferenceableAndAlignedPointer(Base, Alignment, AccessSize, DL,
                                             HeaderFirstNonPHI, AC, &DT);
 }
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 2a68979add666..11e0a221fc887 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -190,31 +190,20 @@ RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
   Members.push_back(Index);
 }

-/// Calculate Start and End points of memory access.
-/// Let's assume A is the first access and B is a memory access on N-th loop
-/// iteration. Then B is calculated as:
-///   B = A + Step*N .
-/// Step value may be positive or negative.
-/// N is a calculated back-edge taken count:
-///     N = (TripCount > 0) ? RoundDown(TripCount -1 , VF) : 0
-/// Start and End points are calculated in the following way:
-/// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
-/// where SizeOfElt is the size of single memory access in bytes.
-///
-/// There is no conflict when the intervals are disjoint:
-/// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
-static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
-    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy,
-    PredicatedScalarEvolution &PSE,
+std::pair<const SCEV *, const SCEV *> llvm::getStartAndEndForAccess(
+    const Loop *Lp, const SCEV *PtrExpr, Type *AccessTy, const SCEV *MaxBECount,
+    ScalarEvolution *SE,
     DenseMap<std::pair<const SCEV *, Type *>,
-             std::pair<const SCEV *, const SCEV *>> &PointerBounds) {
-  ScalarEvolution *SE = PSE.getSE();
-
-  auto [Iter, Ins] = PointerBounds.insert(
-      {{PtrExpr, AccessTy},
-       {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
-  if (!Ins)
-    return Iter->second;
+             std::pair<const SCEV *, const SCEV *>> *PointerBounds) {
+  std::pair<const SCEV *, const SCEV *> *PtrBoundsPair;
+  if (PointerBounds) {
+    auto [Iter, Ins] = PointerBounds->insert(
+        {{PtrExpr, AccessTy},
+         {SE->getCouldNotCompute(), SE->getCouldNotCompute()}});
+    if (!Ins)
+      return Iter->second;
+    PtrBoundsPair = &Iter->second;
+  }

   const SCEV *ScStart;
   const SCEV *ScEnd;
@@ -222,10 +211,8 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
   if (SE->isLoopInvariant(PtrExpr, Lp)) {
     ScStart = ScEnd = PtrExpr;
   } else if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr)) {
-    const SCEV *Ex = PSE.getSymbolicMaxBackedgeTakenCount();
-
     ScStart = AR->getStart();
-    ScEnd = AR->evaluateAtIteration(Ex, *SE);
+    ScEnd = AR->evaluateAtIteration(MaxBECount, *SE);
     const SCEV *Step = AR->getStepRecurrence(*SE);

     // For expressions with negative step, the upper bound is ScStart and the
@@ -244,7 +231,7 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
       return {SE->getCouldNotCompute(), SE->getCouldNotCompute()};

   assert(SE->isLoopInvariant(ScStart, Lp) && "ScStart needs to be invariant");
-  assert(SE->isLoopInvariant(ScEnd, Lp)&& "ScEnd needs to be invariant");
+  assert(SE->isLoopInvariant(ScEnd, Lp) && "ScEnd needs to be invariant");

   // Add the size of the pointed element to ScEnd.
   auto &DL = Lp->getHeader()->getDataLayout();
@@ -252,8 +239,10 @@ static std::pair<const SCEV *, const SCEV *> getStartAndEndForAccess(
   const SCEV *EltSizeSCEV = SE->getStoreSizeOfExpr(IdxTy, AccessTy);
   ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);

-  Iter->second = {ScStart, ScEnd};
-  return Iter->second;
+  std::pair<const SCEV *, const SCEV *> Res = {ScStart, ScEnd};
+  if (PointerBounds)
+    *PtrBoundsPair = Res;
+  return Res;
 }

 /// Calculate Start and End points of memory access using
@@ -263,8 +252,9 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, const SCEV *PtrExpr,
                                     unsigned DepSetId, unsigned ASId,
                                     PredicatedScalarEvolution &PSE,
                                     bool NeedsFreeze) {
+  const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
   const auto &[ScStart, ScEnd] = getStartAndEndForAccess(
-      Lp, PtrExpr, AccessTy, PSE, DC.getPointerBounds());
+      Lp, PtrExpr, AccessTy, MaxBECount, PSE.getSE(), &DC.getPointerBounds());
   assert(!isa<SCEVCouldNotCompute>(ScStart) &&
          !isa<SCEVCouldNotCompute>(ScEnd) &&
          "must be able to compute both start and end expressions");
@@ -1938,10 +1928,11 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
   // required for correctness.
   if (SE.isLoopInvariant(Src, InnermostLoop) ||
       SE.isLoopInvariant(Sink, InnermostLoop)) {
-    const auto &[SrcStart_, SrcEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Src, ATy, PSE, PointerBounds);
-    const auto &[SinkStart_, SinkEnd_] =
-        getStartAndEndForAccess(InnermostLoop, Sink, BTy, PSE, PointerBounds);
+    const SCEV *MaxBECount = PSE.getSymbolicMaxBackedgeTakenCount();
+    const auto &[SrcStart_, SrcEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Src, ATy, MaxBECount, PSE.getSE(), &PointerBounds);
+    const auto &[SinkStart_, SinkEnd_] = getStartAndEndForAccess(
+        InnermostLoop, Sink, BTy, MaxBECount, PSE.getSE(), &PointerBounds);
     if (!isa<SCEVCouldNotCompute>(SrcStart_) &&
         !isa<SCEVCouldNotCompute>(SrcEnd_) &&
        !isa<SCEVCouldNotCompute>(SinkStart_) &&
diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
index 1433e48690bc6..3e50ee42866b9 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll
@@ -2920,8 +2920,8 @@ loop_exit:
   ret i32 %accum.next
 }

-define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
-; CHECK-LABEL: @neg_test_non_unit_stride_off_by_four_bytes(
+define i32 @test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) {
+; CHECK-LABEL: @test_non_unit_stride_off_by_four_bytes(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [103 x i32], align 4
 ; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
@@ -2929,11 +2929,11 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base)
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE33:%.*]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP144:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP145:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP146:%.*]], [[PRED_LOAD_CONTINUE33]] ]
-; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP147:%.*]], [[PRED_LOAD_CONTINUE33]] ]
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP112:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP113:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP114:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP115:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
@@ -2999,170 +2999,74 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base)
 ; CHECK-NEXT:    [[TMP61:%.*]] = insertelement <4 x i1> [[TMP60]], i1 [[TMP57]], i32 1
 ; CHECK-NEXT:    [[TMP62:%.*]] = insertelement <4 x i1> [[TMP61]], i1 [[TMP58]], i32 2
 ; CHECK-NEXT:    [[TMP63:%.*]] = insertelement <4 x i1> [[TMP62]], i1 [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP64:%.*]] = extractelement <4 x i1> [[TMP39]], i32 0
-; CHECK-NEXT:    br i1 [[TMP64]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if: -; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP66:%.*]] = load i32, ptr [[TMP65]], align 4 -; CHECK-NEXT: [[TMP67:%.*]] = insertelement <4 x i32> poison, i32 [[TMP66]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]] -; CHECK: pred.load.continue: -; CHECK-NEXT: [[TMP68:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP67]], [[PRED_LOAD_IF]] ] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP39]], i32 1 -; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]] -; CHECK: pred.load.if4: -; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP71:%.*]] = load i32, ptr [[TMP70]], align 4 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <4 x i32> [[TMP68]], i32 [[TMP71]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]] -; CHECK: pred.load.continue5: -; CHECK-NEXT: [[TMP73:%.*]] = phi <4 x i32> [ [[TMP68]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP72]], [[PRED_LOAD_IF4]] ] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP39]], i32 2 -; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7:%.*]] -; CHECK: pred.load.if6: -; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP76:%.*]] = load i32, ptr [[TMP75]], align 4 -; CHECK-NEXT: [[TMP77:%.*]] = insertelement <4 x i32> [[TMP73]], i32 [[TMP76]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]] -; CHECK: pred.load.continue7: -; CHECK-NEXT: [[TMP78:%.*]] = phi <4 x i32> [ [[TMP73]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP77]], [[PRED_LOAD_IF6]] ] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <4 x i1> [[TMP39]], i32 3 -; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_LOAD_IF8:%.*]], label [[PRED_LOAD_CONTINUE9:%.*]] -; CHECK: pred.load.if8: -; CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP80]], align 4 -; CHECK-NEXT: [[TMP82:%.*]] = insertelement <4 x i32> [[TMP78]], i32 [[TMP81]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE9]] -; CHECK: pred.load.continue9: -; CHECK-NEXT: [[TMP83:%.*]] = phi <4 x i32> [ [[TMP78]], [[PRED_LOAD_CONTINUE7]] ], [ [[TMP82]], [[PRED_LOAD_IF8]] ] -; CHECK-NEXT: [[TMP84:%.*]] = extractelement <4 x i1> [[TMP47]], i32 0 -; CHECK-NEXT: br i1 [[TMP84]], label [[PRED_LOAD_IF10:%.*]], label [[PRED_LOAD_CONTINUE11:%.*]] -; CHECK: pred.load.if10: -; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP86:%.*]] = load i32, ptr [[TMP85]], align 4 -; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> poison, i32 [[TMP86]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE11]] -; CHECK: pred.load.continue11: -; CHECK-NEXT: [[TMP88:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE9]] ], [ [[TMP87]], [[PRED_LOAD_IF10]] ] -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <4 x i1> [[TMP47]], i32 1 -; CHECK-NEXT: br i1 [[TMP89]], label [[PRED_LOAD_IF12:%.*]], label [[PRED_LOAD_CONTINUE13:%.*]] -; CHECK: pred.load.if12: -; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] -; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP90]], align 4 -; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> [[TMP88]], i32 [[TMP91]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE13]] -; CHECK: pred.load.continue13: -; CHECK-NEXT: [[TMP93:%.*]] = phi <4 x i32> [ [[TMP88]], [[PRED_LOAD_CONTINUE11]] ], 
[ [[TMP92]], [[PRED_LOAD_IF12]] ] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <4 x i1> [[TMP47]], i32 2 -; CHECK-NEXT: br i1 [[TMP94]], label [[PRED_LOAD_IF14:%.*]], label [[PRED_LOAD_CONTINUE15:%.*]] -; CHECK: pred.load.if14: -; CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP95]], align 4 -; CHECK-NEXT: [[TMP97:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP96]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE15]] -; CHECK: pred.load.continue15: -; CHECK-NEXT: [[TMP98:%.*]] = phi <4 x i32> [ [[TMP93]], [[PRED_LOAD_CONTINUE13]] ], [ [[TMP97]], [[PRED_LOAD_IF14]] ] -; CHECK-NEXT: [[TMP99:%.*]] = extractelement <4 x i1> [[TMP47]], i32 3 -; CHECK-NEXT: br i1 [[TMP99]], label [[PRED_LOAD_IF16:%.*]], label [[PRED_LOAD_CONTINUE17:%.*]] -; CHECK: pred.load.if16: -; CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP101:%.*]] = load i32, ptr [[TMP100]], align 4 -; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP98]], i32 [[TMP101]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE17]] -; CHECK: pred.load.continue17: -; CHECK-NEXT: [[TMP103:%.*]] = phi <4 x i32> [ [[TMP98]], [[PRED_LOAD_CONTINUE15]] ], [ [[TMP102]], [[PRED_LOAD_IF16]] ] -; CHECK-NEXT: [[TMP104:%.*]] = extractelement <4 x i1> [[TMP55]], i32 0 -; CHECK-NEXT: br i1 [[TMP104]], label [[PRED_LOAD_IF18:%.*]], label [[PRED_LOAD_CONTINUE19:%.*]] -; CHECK: pred.load.if18: -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP105]], align 4 -; CHECK-NEXT: [[TMP107:%.*]] = insertelement <4 x i32> poison, i32 [[TMP106]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE19]] -; CHECK: pred.load.continue19: -; CHECK-NEXT: [[TMP108:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE17]] ], [ [[TMP107]], [[PRED_LOAD_IF18]] ] -; CHECK-NEXT: [[TMP109:%.*]] = extractelement <4 x i1> [[TMP55]], i32 1 -; CHECK-NEXT: br i1 [[TMP109]], label [[PRED_LOAD_IF20:%.*]], label [[PRED_LOAD_CONTINUE21:%.*]] -; CHECK: pred.load.if20: -; CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP111:%.*]] = load i32, ptr [[TMP110]], align 4 -; CHECK-NEXT: [[TMP112:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP111]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE21]] -; CHECK: pred.load.continue21: -; CHECK-NEXT: [[TMP113:%.*]] = phi <4 x i32> [ [[TMP108]], [[PRED_LOAD_CONTINUE19]] ], [ [[TMP112]], [[PRED_LOAD_IF20]] ] -; CHECK-NEXT: [[TMP114:%.*]] = extractelement <4 x i1> [[TMP55]], i32 2 -; CHECK-NEXT: br i1 [[TMP114]], label [[PRED_LOAD_IF22:%.*]], label [[PRED_LOAD_CONTINUE23:%.*]] -; CHECK: pred.load.if22: -; CHECK-NEXT: [[TMP115:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] -; CHECK-NEXT: [[TMP116:%.*]] = load i32, ptr [[TMP115]], align 4 -; CHECK-NEXT: [[TMP117:%.*]] = insertelement <4 x i32> [[TMP113]], i32 [[TMP116]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE23]] -; CHECK: pred.load.continue23: -; CHECK-NEXT: [[TMP118:%.*]] = phi <4 x i32> [ [[TMP113]], [[PRED_LOAD_CONTINUE21]] ], [ [[TMP117]], [[PRED_LOAD_IF22]] ] -; CHECK-NEXT: [[TMP119:%.*]] = extractelement <4 x i1> [[TMP55]], i32 3 -; CHECK-NEXT: br i1 [[TMP119]], label [[PRED_LOAD_IF24:%.*]], label [[PRED_LOAD_CONTINUE25:%.*]] -; CHECK: pred.load.if24: -; CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] -; CHECK-NEXT: 
[[TMP121:%.*]] = load i32, ptr [[TMP120]], align 4 -; CHECK-NEXT: [[TMP122:%.*]] = insertelement <4 x i32> [[TMP118]], i32 [[TMP121]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE25]] -; CHECK: pred.load.continue25: -; CHECK-NEXT: [[TMP123:%.*]] = phi <4 x i32> [ [[TMP118]], [[PRED_LOAD_CONTINUE23]] ], [ [[TMP122]], [[PRED_LOAD_IF24]] ] -; CHECK-NEXT: [[TMP124:%.*]] = extractelement <4 x i1> [[TMP63]], i32 0 -; CHECK-NEXT: br i1 [[TMP124]], label [[PRED_LOAD_IF26:%.*]], label [[PRED_LOAD_CONTINUE27:%.*]] -; CHECK: pred.load.if26: -; CHECK-NEXT: [[TMP125:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP126:%.*]] = load i32, ptr [[TMP125]], align 4 -; CHECK-NEXT: [[TMP127:%.*]] = insertelement <4 x i32> poison, i32 [[TMP126]], i32 0 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE27]] -; CHECK: pred.load.continue27: -; CHECK-NEXT: [[TMP128:%.*]] = phi <4 x i32> [ poison, [[PRED_LOAD_CONTINUE25]] ], [ [[TMP127]], [[PRED_LOAD_IF26]] ] -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <4 x i1> [[TMP63]], i32 1 -; CHECK-NEXT: br i1 [[TMP129]], label [[PRED_LOAD_IF28:%.*]], label [[PRED_LOAD_CONTINUE29:%.*]] -; CHECK: pred.load.if28: -; CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP131:%.*]] = load i32, ptr [[TMP130]], align 4 -; CHECK-NEXT: [[TMP132:%.*]] = insertelement <4 x i32> [[TMP128]], i32 [[TMP131]], i32 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE29]] -; CHECK: pred.load.continue29: -; CHECK-NEXT: [[TMP133:%.*]] = phi <4 x i32> [ [[TMP128]], [[PRED_LOAD_CONTINUE27]] ], [ [[TMP132]], [[PRED_LOAD_IF28]] ] -; CHECK-NEXT: [[TMP134:%.*]] = extractelement <4 x i1> [[TMP63]], i32 2 -; CHECK-NEXT: br i1 [[TMP134]], label [[PRED_LOAD_IF30:%.*]], label [[PRED_LOAD_CONTINUE31:%.*]] -; CHECK: pred.load.if30: -; CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP136:%.*]] = load i32, ptr [[TMP135]], align 4 -; CHECK-NEXT: [[TMP137:%.*]] = insertelement <4 x i32> [[TMP133]], i32 [[TMP136]], i32 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE31]] -; CHECK: pred.load.continue31: -; CHECK-NEXT: [[TMP138:%.*]] = phi <4 x i32> [ [[TMP133]], [[PRED_LOAD_CONTINUE29]] ], [ [[TMP137]], [[PRED_LOAD_IF30]] ] -; CHECK-NEXT: [[TMP139:%.*]] = extractelement <4 x i1> [[TMP63]], i32 3 -; CHECK-NEXT: br i1 [[TMP139]], label [[PRED_LOAD_IF32:%.*]], label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.if32: -; CHECK-NEXT: [[TMP140:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP141:%.*]] = load i32, ptr [[TMP140]], align 4 -; CHECK-NEXT: [[TMP142:%.*]] = insertelement <4 x i32> [[TMP138]], i32 [[TMP141]], i32 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE33]] -; CHECK: pred.load.continue33: -; CHECK-NEXT: [[TMP143:%.*]] = phi <4 x i32> [ [[TMP138]], [[PRED_LOAD_CONTINUE31]] ], [ [[TMP142]], [[PRED_LOAD_IF32]] ] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP83]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI34:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI35:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP123]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[PREDPHI36:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP143]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP144]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] -; CHECK-NEXT: [[TMP145]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI34]] -; CHECK-NEXT: [[TMP146]] = add <4 x i32> [[VEC_PHI2]], 
[[PREDPHI35]] -; CHECK-NEXT: [[TMP147]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI36]] +; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP10]] +; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP14]] +; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, ptr [[ALLOCA]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP80:%.*]] = load i32, ptr [[TMP64]], align 4 +; CHECK-NEXT: [[TMP81:%.*]] = load i32, ptr [[TMP65]], align 4 +; CHECK-NEXT: [[TMP82:%.*]] = load i32, ptr [[TMP66]], align 4 +; CHECK-NEXT: [[TMP83:%.*]] = load i32, ptr [[TMP67]], align 4 +; CHECK-NEXT: [[TMP84:%.*]] = insertelement <4 x i32> poison, i32 [[TMP80]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = insertelement <4 x i32> [[TMP84]], i32 [[TMP81]], i32 1 +; CHECK-NEXT: [[TMP86:%.*]] = insertelement <4 x i32> [[TMP85]], i32 [[TMP82]], i32 2 +; CHECK-NEXT: [[TMP87:%.*]] = insertelement <4 x i32> [[TMP86]], i32 [[TMP83]], i32 3 +; CHECK-NEXT: [[TMP88:%.*]] = load i32, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[TMP89:%.*]] = load i32, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[TMP90:%.*]] = load i32, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[TMP91:%.*]] = load i32, ptr [[TMP71]], align 4 +; CHECK-NEXT: [[TMP92:%.*]] = insertelement <4 x i32> poison, i32 [[TMP88]], i32 0 +; CHECK-NEXT: [[TMP93:%.*]] = insertelement <4 x i32> [[TMP92]], i32 [[TMP89]], i32 1 +; CHECK-NEXT: [[TMP94:%.*]] = insertelement <4 x i32> [[TMP93]], i32 [[TMP90]], i32 2 +; CHECK-NEXT: [[TMP95:%.*]] = insertelement <4 x i32> [[TMP94]], i32 [[TMP91]], i32 3 +; CHECK-NEXT: [[TMP96:%.*]] = load i32, ptr [[TMP72]], align 4 +; CHECK-NEXT: [[TMP97:%.*]] = load i32, ptr [[TMP73]], align 4 +; CHECK-NEXT: [[TMP98:%.*]] = load i32, ptr [[TMP74]], align 4 +; CHECK-NEXT: [[TMP99:%.*]] = load i32, ptr [[TMP75]], align 4 +; CHECK-NEXT: [[TMP100:%.*]] = insertelement <4 x i32> poison, i32 [[TMP96]], i32 0 +; CHECK-NEXT: [[TMP101:%.*]] = insertelement <4 x i32> [[TMP100]], i32 [[TMP97]], i32 1 +; CHECK-NEXT: [[TMP102:%.*]] = insertelement <4 x i32> [[TMP101]], i32 [[TMP98]], i32 2 +; CHECK-NEXT: [[TMP103:%.*]] = insertelement <4 x i32> [[TMP102]], i32 [[TMP99]], i32 3 +; CHECK-NEXT: [[TMP104:%.*]] = load i32, ptr [[TMP76]], align 4 +; CHECK-NEXT: [[TMP105:%.*]] = load i32, ptr [[TMP77]], align 4 +; CHECK-NEXT: [[TMP106:%.*]] = load i32, ptr [[TMP78]], align 4 +; CHECK-NEXT: [[TMP107:%.*]] = load i32, ptr 
[[TMP79]], align 4 +; CHECK-NEXT: [[TMP108:%.*]] = insertelement <4 x i32> poison, i32 [[TMP104]], i32 0 +; CHECK-NEXT: [[TMP109:%.*]] = insertelement <4 x i32> [[TMP108]], i32 [[TMP105]], i32 1 +; CHECK-NEXT: [[TMP110:%.*]] = insertelement <4 x i32> [[TMP109]], i32 [[TMP106]], i32 2 +; CHECK-NEXT: [[TMP111:%.*]] = insertelement <4 x i32> [[TMP110]], i32 [[TMP107]], i32 3 +; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP39]], <4 x i32> [[TMP87]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI4:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[TMP95]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <4 x i1> [[TMP55]], <4 x i32> [[TMP103]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[PREDPHI6:%.*]] = select <4 x i1> [[TMP63]], <4 x i32> [[TMP111]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP112]] = add <4 x i32> [[VEC_PHI]], [[PREDPHI]] +; CHECK-NEXT: [[TMP113]] = add <4 x i32> [[VEC_PHI1]], [[PREDPHI4]] +; CHECK-NEXT: [[TMP114]] = add <4 x i32> [[VEC_PHI2]], [[PREDPHI5]] +; CHECK-NEXT: [[TMP115]] = add <4 x i32> [[VEC_PHI3]], [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP148:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 -; CHECK-NEXT: br i1 [[TMP148]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] +; CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], 48 +; CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP145]], [[TMP144]] -; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <4 x i32> [[TMP146]], [[BIN_RDX]] -; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <4 x i32> [[TMP147]], [[BIN_RDX37]] -; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP113]], [[TMP112]] +; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <4 x i32> [[TMP114]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX8:%.*]] = add <4 x i32> [[TMP115]], [[BIN_RDX7]] +; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: br i1 false, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP149]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP117]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] @@ -3181,7 +3085,7 @@ define i32 @neg_test_non_unit_stride_off_by_four_bytes(i64 %len, ptr %test_base) ; CHECK-NEXT: [[EXIT:%.*]] = icmp ugt i64 [[IV]], 100 ; CHECK-NEXT: br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP37:![0-9]+]] ; CHECK: loop_exit: -; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP149]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LATCH]] ], [ [[TMP117]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]] ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 0f4e327891899..cbc483fabc184 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -351,27 
+351,30 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 -1 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i32>, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <2 x i32> [[WIDE_LOAD1]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP0]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP10]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.if1: +; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i32> [[REVERSE2]], i32 1 ; CHECK-NEXT: [[TMP17:%.*]] = shl nsw i32 [[TMP16]], 2 ; CHECK-NEXT: store i32 [[TMP17]], ptr [[TMP15]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue2: +; CHECK: pred.store.continue4: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -665,12 +668,15 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i32> [[REVERSE]], splat (i32 3) ; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], splat (i1 true) ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 ; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: -; CHECK-NEXT: 
[[TMP8:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP0]] ; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i32 [[TMP11]], 2 ; CHECK-NEXT: store i32 [[TMP15]], ptr [[TMP14]], align 4 @@ -680,9 +686,6 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]] ; CHECK: pred.store.if1: ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], -1 -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_SRC]], i64 0, i64 [[TMP23]] -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_DEST]], i64 0, i64 [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = shl nsw i32 [[TMP12]], 2 ; CHECK-NEXT: store i32 [[TMP19]], ptr [[TMP18]], align 4 From b8d921003d1f20819b897b066e02d22787f11550 Mon Sep 17 00:00:00 2001 From: David Truby Date: Mon, 27 Jan 2025 12:23:03 +0000 Subject: [PATCH 178/432] [flang][NFC] Restrict -funroll-loops tests to known working targets (#123939) If -funroll-loops tests are not restricted to specific targets the tests may behave differently based on the host platform. This patch restricts the tests to aarch64 and x86_64, and removes the PowerPC XFAIL. --- flang/test/HLFIR/unroll-loops.fir | 12 ++++++++---- flang/test/Integration/unroll-loops.f90 | 15 ++++++++------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/flang/test/HLFIR/unroll-loops.fir b/flang/test/HLFIR/unroll-loops.fir index 4494cfa570dd7..1c214f76f5649 100644 --- a/flang/test/HLFIR/unroll-loops.fir +++ b/flang/test/HLFIR/unroll-loops.fir @@ -1,7 +1,11 @@ -// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 
-triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL // FIXME: https://github.com/llvm/llvm-project/issues/123668 // XFAIL: target=powerpc64{{.*}} diff --git a/flang/test/Integration/unroll-loops.f90 b/flang/test/Integration/unroll-loops.f90 index 4b4a394502881..86c57dd2fd0ea 100644 --- a/flang/test/Integration/unroll-loops.f90 +++ b/flang/test/Integration/unroll-loops.f90 @@ -1,10 +1,11 @@ -! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL - -! FIXME: https://github.com/llvm/llvm-project/issues/123668 -! XFAIL: target=powerpc64{{.*}} +! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL ! CHECK-LABEL: @unroll ! 
CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) From 98e52db4a5e57f919bb70312f9ca7deb16ee6fcb Mon Sep 17 00:00:00 2001 From: David Truby Date: Mon, 27 Jan 2025 12:37:58 +0000 Subject: [PATCH 179/432] Revert "[flang][NFC] Restrict -funroll-loops tests to known working targets" (#124536) Reverts llvm/llvm-project#123939 --- flang/test/HLFIR/unroll-loops.fir | 12 ++++-------- flang/test/Integration/unroll-loops.f90 | 15 +++++++-------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/flang/test/HLFIR/unroll-loops.fir b/flang/test/HLFIR/unroll-loops.fir index 1c214f76f5649..4494cfa570dd7 100644 --- a/flang/test/HLFIR/unroll-loops.fir +++ b/flang/test/HLFIR/unroll-loops.fir @@ -1,11 +1,7 @@ -// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +// RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL // FIXME: https://github.com/llvm/llvm-project/issues/123668 // XFAIL: target=powerpc64{{.*}} diff --git a/flang/test/Integration/unroll-loops.f90 b/flang/test/Integration/unroll-loops.f90 index 86c57dd2fd0ea..4b4a394502881 100644 --- a/flang/test/Integration/unroll-loops.f90 +++ b/flang/test/Integration/unroll-loops.f90 @@ -1,11 +1,10 @@ -! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple aarch64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! 
RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL -! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -triple x86_64-unknown-linux-gnu -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -funroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O2 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -fno-unroll-loops -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL +! RUN: %flang_fc1 -emit-llvm -O1 -mllvm -force-vector-width=2 -o- %s | FileCheck %s --check-prefixes=CHECK,NO-UNROLL + +! FIXME: https://github.com/llvm/llvm-project/issues/123668 +! XFAIL: target=powerpc64{{.*}} ! CHECK-LABEL: @unroll ! CHECK-SAME: (ptr nocapture writeonly %[[ARG0:.*]]) From 6087c3049656bbaef51fffb48e2404e86f7e0d3f Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Mon, 27 Jan 2025 13:41:18 +0100 Subject: [PATCH 180/432] [lldb] Simplify preprocessor conditional (#124522) The long list of defines is just a very elaborate way to say "not windows". --- lldb/source/Host/common/Host.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp index fdb623667bc25..d5054008268b9 100644 --- a/lldb/source/Host/common/Host.cpp +++ b/lldb/source/Host/common/Host.cpp @@ -11,13 +11,19 @@ #include #include #include + #ifndef _WIN32 #include #include #include #include #include +#include +#include #include +#if !defined(__ANDROID__) +#include +#endif #endif #if defined(__APPLE__) @@ -26,16 +32,6 @@ #include #endif -#if defined(__linux__) || defined(__FreeBSD__) || \ - defined(__FreeBSD_kernel__) || defined(__APPLE__) || \ - defined(__NetBSD__) || defined(__OpenBSD__) || defined(__EMSCRIPTEN__) -#if !defined(__ANDROID__) -#include -#endif -#include -#include -#endif - #if defined(__FreeBSD__) #include #endif From cfdd7d736a94aa65a23eb41258d9d6712cdb2b0d Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Mon, 27 Jan 2025 12:50:10 +0000 Subject: [PATCH 181/432] [compiler-rt][rtsan] sched cpu affinity for linux interception. 
(#124194) --- .../lib/rtsan/rtsan_interceptors_posix.cpp | 22 +++++++++++++++++++ .../tests/rtsan_test_interceptors_posix.cpp | 16 ++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 1b499f2194f21..7d8b1c84f7d1c 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -740,6 +740,26 @@ INTERCEPTOR(int, sched_yield, void) { return REAL(sched_yield)(); } +#if SANITIZER_LINUX +INTERCEPTOR(int, sched_getaffinity, pid_t pid, size_t len, cpu_set_t *set) { + __rtsan_notify_intercepted_call("sched_getaffinity"); + return REAL(sched_getaffinity)(pid, len, set); +} + +INTERCEPTOR(int, sched_setaffinity, pid_t pid, size_t len, + const cpu_set_t *set) { + __rtsan_notify_intercepted_call("sched_setaffinity"); + return REAL(sched_setaffinity)(pid, len, set); +} +#define RTSAN_MAYBE_INTERCEPT_SCHED_GETAFFINITY \ + INTERCEPT_FUNCTION(sched_getaffinity) +#define RTSAN_MAYBE_INTERCEPT_SCHED_SETAFFINITY \ + INTERCEPT_FUNCTION(sched_setaffinity) +#else +#define RTSAN_MAYBE_INTERCEPT_SCHED_GETAFFINITY +#define RTSAN_MAYBE_INTERCEPT_SCHED_SETAFFINITY +#endif + // Memory INTERCEPTOR(void *, calloc, SIZE_T num, SIZE_T size) { @@ -1415,6 +1435,8 @@ void __rtsan::InitializeInterceptors() { INTERCEPT_FUNCTION(usleep); INTERCEPT_FUNCTION(nanosleep); INTERCEPT_FUNCTION(sched_yield); + RTSAN_MAYBE_INTERCEPT_SCHED_GETAFFINITY; + RTSAN_MAYBE_INTERCEPT_SCHED_SETAFFINITY; INTERCEPT_FUNCTION(accept); INTERCEPT_FUNCTION(bind); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index a4f2b92b7c494..ef9ec626610d5 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -323,6 +323,22 @@ TEST(TestRtsanInterceptors, SchedYieldDiesWhenRealtime) { ExpectNonRealtimeSurvival(Func); } +#if SANITIZER_LINUX +TEST(TestRtsanInterceptors, SchedGetaffinityDiesWhenRealtime) { + cpu_set_t set{}; + auto Func = [&set]() { sched_getaffinity(0, sizeof(set), &set); }; + ExpectRealtimeDeath(Func, "sched_getaffinity"); + ExpectNonRealtimeSurvival(Func); +} + +TEST(TestRtsanInterceptors, SchedSetaffinityDiesWhenRealtime) { + cpu_set_t set{}; + auto Func = [&set]() { sched_setaffinity(0, sizeof(set), &set); }; + ExpectRealtimeDeath(Func, "sched_setaffinity"); + ExpectNonRealtimeSurvival(Func); +} +#endif + /* Filesystem */ From e21b80464a44ef6491e44517ac59892c10ba2d6c Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Mon, 27 Jan 2025 12:52:35 +0000 Subject: [PATCH 182/432] [compiler-rt][rtsan] socketpair interception. 
(#124107) --- compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp | 6 ++++++ .../lib/rtsan/tests/rtsan_test_interceptors_posix.cpp | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp index 7d8b1c84f7d1c..6816119065263 100644 --- a/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/rtsan_interceptors_posix.cpp @@ -1088,6 +1088,11 @@ INTERCEPTOR(int, setsockopt, int socket, int level, int option, #define RTSAN_MAYBE_INTERCEPT_SETSOCKOPT #endif +INTERCEPTOR(int, socketpair, int domain, int type, int protocol, int pair[2]) { + __rtsan_notify_intercepted_call("socketpair"); + return REAL(socketpair)(domain, type, protocol, pair); +} + // I/O Multiplexing INTERCEPTOR(int, poll, struct pollfd *fds, nfds_t nfds, int timeout) { @@ -1459,6 +1464,7 @@ void __rtsan::InitializeInterceptors() { RTSAN_MAYBE_INTERCEPT_GETPEERNAME; RTSAN_MAYBE_INTERCEPT_GETSOCKOPT; RTSAN_MAYBE_INTERCEPT_SETSOCKOPT; + INTERCEPT_FUNCTION(socketpair); RTSAN_MAYBE_INTERCEPT_SELECT; INTERCEPT_FUNCTION(pselect); diff --git a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp index ef9ec626610d5..59663776366bb 100644 --- a/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp +++ b/compiler-rt/lib/rtsan/tests/rtsan_test_interceptors_posix.cpp @@ -1351,6 +1351,13 @@ TEST(TestRtsanInterceptors, SetsockoptOnASocketDiesWhenRealtime) { } #endif +TEST(TestRtsanInterceptors, SocketpairDiesWhenRealtime) { + int pair[2]{}; + auto Func = [&pair]() { socketpair(0, 0, 0, pair); }; + ExpectRealtimeDeath(Func, "socketpair"); + ExpectNonRealtimeSurvival(Func); +} + /* I/O Multiplexing */ From d8ad1eef8ffeb4ef5474f0e38d6d340d82c53572 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 12:53:38 +0000 Subject: [PATCH 183/432] [AArch64] Generate zeroing forms of certain SVE2.2 instructions (7/11) (#116833) SVE2.2 introduces instructions with predicated forms with zeroing of the inactive lanes. This allows in some cases to save a `movprfx` or a `mov` instruction when emitting code for `_x` or `_z` variants of intrinsics. This patch adds support for emitting the zeroing forms of certain `FLOGB` instructions. 
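As a rough illustration of the saving (a sketch, not code from this patch; it assumes the ACLE `svlogb` intrinsic from `arm_sve.h` and mirrors the codegen shown by the `test_svlogb_f16_z` case in the new test file below):

```c
#include <arm_sve.h>

// svlogb_f16_z must zero the inactive lanes of the result. Without a
// zeroing form of FLOGB, codegen needs an extra instruction to clear
// the destination first:
//   SVE2:    mov   z0.h, #0
//            flogb z0.h, p0/m, z1.h
//   SVE2.2:  flogb z0.h, p0/z, z1.h
svint16_t logb_z(svbool_t pg, svfloat16_t x) {
  return svlogb_f16_z(pg, x);
}
```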
--- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +- .../CodeGen/AArch64/zeroing-forms-flogb.ll | 258 ++++++++++++++++++ 3 files changed, 264 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 9ed683e73e9cc..2d6a3b6199c67 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4280,7 +4280,7 @@ let Predicates = [HasSVE2p2_or_SME2p2] in { defm SCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b0, "scvtf", "int_aarch64_sve_scvtf", AArch64scvtf_mt>; defm UCVTF_ZPzZ : sve_fp_z2op_p_zd_c<0b1, "ucvtf", "int_aarch64_sve_ucvtf", AArch64ucvtf_mt>; // Signed integer base 2 logarithm of fp value, zeroing predicate - defm FLOGB_ZPzZ : sve_fp_z2op_p_zd_d_flogb<"flogb">; + defm FLOGB_ZPzZ : sve_fp_z2op_p_zd_d_flogb<"flogb", int_aarch64_sve_flogb>; // SVE2 integer unary operations, zeroing predicate def URECPE_ZPzZ : sve2_int_un_pred_arit_z<0b10, 0b00, "urecpe", ZPR32>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 8125014faa033..199b2e343d3f7 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -3332,10 +3332,14 @@ multiclass sve_fp_z2op_p_zd_c(NAME # _DtoD)>; } -multiclass sve_fp_z2op_p_zd_d_flogb { +multiclass sve_fp_z2op_p_zd_d_flogb { def _H : sve_fp_z2op_p_zd<0b0011001, asm, ZPR16, ZPR16>; def _S : sve_fp_z2op_p_zd<0b0011010, asm, ZPR32, ZPR32>; def _D : sve_fp_z2op_p_zd<0b0011011, asm, ZPR64, ZPR64>; + + defm : SVE_3_Op_UndefZero_Pat(NAME # _H)>; + defm : SVE_3_Op_UndefZero_Pat(NAME # _S)>; + defm : SVE_3_Op_UndefZero_Pat(NAME # _D)>; } multiclass sve_fp_z2op_p_zd_b_0 { diff --git a/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll b/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll new file mode 100644 index 0000000000000..23620a3419b99 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/zeroing-forms-flogb.ll @@ -0,0 +1,258 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve2 < %s | FileCheck %s +; RUN: llc -mattr=+sve2p2 < %s | FileCheck %s -check-prefix CHECK-2p2 + +; RUN: llc -mattr=+sme2 -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sme2p2 -force-streaming < %s | FileCheck %s -check-prefix CHECK-2p2 + +target triple = "aarch64-linux" + +define @test_svlogb_f16_x_1( %pg, %x) { +; CHECK-LABEL: test_svlogb_f16_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.h, p0/m, z0.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f16_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z0.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f16_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f16_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f16_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f16_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f16_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.h, #0 // =0x0 +; CHECK-NEXT: flogb z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; 
CHECK-2p2-LABEL: test_svlogb_f16_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svlogb_f32_x_1( %pg, %x) { +; CHECK-LABEL: test_svlogb_f32_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.s, p0/m, z0.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f32_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z0.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f32_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f32_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f32_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f32_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f32_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.s, #0 // =0x0 +; CHECK-NEXT: flogb z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f32_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svlogb_f64_x_1( %pg, %x) { +; CHECK-LABEL: test_svlogb_f64_x_1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.d, p0/m, z0.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f64_x_1: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z0.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f64_x_2( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f64_x_2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: flogb z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f64_x_2: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_f64_z( %pg, double %z0, %x) { +; CHECK-LABEL: test_svlogb_f64_z: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, #0 // =0x0 +; CHECK-NEXT: flogb z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_f64_z: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( zeroinitializer, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv8f16_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svlogb_nxv8f16_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: flogb z0.h, p0/m, z1.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv8f16_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z1.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv8f16_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svlogb_nxv8f16_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: flogb z0.h, p0/m, z2.h +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: 
test_svlogb_nxv8f16_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.h +; CHECK-2p2-NEXT: flogb z0.h, p0/z, z2.h +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv8f16( %x, %pg, %y) + ret %0 +} + +define @test_svlogb_nxv4f32_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svlogb_nxv4f32_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: flogb z0.s, p0/m, z1.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv4f32_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z1.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv4f32_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svlogb_nxv4f32_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: flogb z0.s, p0/m, z2.s +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv4f32_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.s +; CHECK-2p2-NEXT: flogb z0.s, p0/z, z2.s +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv4f32( %x, %pg, %y) + ret %0 +} + +define @test_svlogb_nxv2f64_ptrue_u(double %z0, %x) { +; CHECK-LABEL: test_svlogb_nxv2f64_ptrue_u: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: flogb z0.d, p0/m, z1.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv2f64_ptrue_u: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z1.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( poison, %pg, %x) + ret %0 +} + +define @test_svlogb_nxv2f64_ptrue(double %z0, %x, %y) { +; CHECK-LABEL: test_svlogb_nxv2f64_ptrue: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: flogb z0.d, p0/m, z2.d +; CHECK-NEXT: ret +; +; CHECK-2p2-LABEL: test_svlogb_nxv2f64_ptrue: +; CHECK-2p2: // %bb.0: // %entry +; CHECK-2p2-NEXT: ptrue p0.d +; CHECK-2p2-NEXT: flogb z0.d, p0/z, z2.d +; CHECK-2p2-NEXT: ret +entry: + %pg = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %0 = tail call @llvm.aarch64.sve.flogb.nxv2f64( %x, %pg, %y) + ret %0 +} + From b31e9747d0866ff97a1cd4a608b7eade31c0aa0b Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 27 Jan 2025 13:06:33 +0000 Subject: [PATCH 184/432] [lldb][AArch64] Fix expression evaluation with Guarded Control Stacks (#123918) When the Guarded Control Stack (GCS) is enabled, returns cause the processor to validate that the address at the location pointed to by gcspr_el0 matches the one in the link register. ``` ret (lr=A) << pc | GCS | +=====+ | A | | B | << gcspr_el0 Fault: tried to return to A when you should have returned to B. ``` Therefore when an expression wrapper function tries to return to the expression return address (usually `_start` if there is a libc), it would fault. ``` ret (lr=_start) << pc | GCS | +============+ | user_func1 | | user_func2 | << gcspr_el0 Fault: tried to return to _start when you should have returned to user_func2. ``` To fix this we must push that return address to the GCS in PrepareTrivialCall. This value is then consumed by the final return and the expression completes as expected. 
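In outline, the push amounts to the following (a simplified sketch under the ABI described above, not the lldb code itself; `read_reg`, `write_reg` and `write_mem` are hypothetical stand-ins for the debugger's register-context and process APIs):

```c
#include <stdint.h>

// Hypothetical stand-ins for the debugger's register and memory APIs.
extern uint64_t read_reg(const char *name);
extern void write_reg(const char *name, uint64_t value);
extern int write_mem(uint64_t addr, const void *buf, uint64_t len);

// Sketch: push the expression return address onto the Guarded Control
// Stack so that the final ret of the expression wrapper validates.
// A GCS link-register entry is 8 bytes.
void push_expr_return_addr(uint64_t return_addr) {
  uint64_t gcspr = read_reg("gcspr_el0");
  gcspr -= 8;                        // make room for one entry
  write_reg("gcspr_el0", gcspr);     // the "push" moves the pointer down
  write_mem(gcspr, &return_addr, 8); // store the return address
}
```

Restoring gcspr_el0 to its original value after the call has finished then serves as the matching pop.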
If for some reason that fails, we will manually restore the value of gcspr_el0, because it turns out that PrepareTrivialCall does not restore registers if it fails at all. So for now I am handling gcspr_el0 specifically, but I have filed https://github.com/llvm/llvm-project/issues/124269 to address the general problem. (the other things PrepareTrivialCall does are exceedingly likely to not fail, so we have never noticed this) ``` ret (lr=_start) << pc | GCS | +============+ | user_func1 | | user_func2 | | _start | << gcspr_el0 No fault, we return to _start as normal. ``` The gcspr_el0 register will be restored after expression evaluation so that the program can continue correctly. However, due to restrictions in the Linux GCS ABI, we will not restore the enable bit of gcs_features_enabled. Re-enabling GCS via ptrace is not supported because it requires memory to be allocated by the kernel. We could disable GCS if the expression enabled GCS, however this would use up that state transition that the program might later rely on. And generally it is cleaner to ignore the enable bit, rather than one state transition of it. We will also not restore the GCS entry that was overwritten with the expression's return address. On the grounds that: * This entry will never be used by the program. If the program branches, the entry will be overwritten. If the program returns, gcspr_el0 will point to the entry before the expression return address and that entry will instead be validated. * Any expression that calls functions will overwrite even more entries, so the user needs to be aware of that anyway if they want to preserve the contents of the GCS for inspection. * An expression could leave the program in a state where restoring the value makes the situation worse. Especially if we ever support this in bare metal debugging. I will later document all this on https://lldb.llvm.org/use/aarch64-linux.html. Tests have been added for: * A function call that does not interact with GCS. * A call that does, and disables it (we do not re-enable it). * A call that does, and enables it (we do not disable it again). * Failure to push an entry to the GCS stack. --- .../Plugins/ABI/AArch64/ABISysV_arm64.cpp | 75 +++++++ .../NativeRegisterContextLinux_arm64.cpp | 20 +- .../linux/aarch64/gcs/TestAArch64LinuxGCS.py | 211 +++++++++++++++--- lldb/test/API/linux/aarch64/gcs/main.c | 47 +++- 4 files changed, 311 insertions(+), 42 deletions(-) diff --git a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp index 93b8141e97ef8..74047ea65788c 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABISysV_arm64.cpp @@ -60,6 +60,69 @@ ABISysV_arm64::CreateInstance(lldb::ProcessSP process_sp, const ArchSpec &arch) return ABISP(); } +static Status PushToLinuxGuardedControlStack(addr_t return_addr, + RegisterContext *reg_ctx, + Thread &thread) { + Status err; + + // If the Guarded Control Stack extension is present we may need to put the + // return address onto that stack. + const RegisterInfo *gcs_features_enabled_info = + reg_ctx->GetRegisterInfoByName("gcs_features_enabled"); + if (!gcs_features_enabled_info) + return err; + + uint64_t gcs_features_enabled = reg_ctx->ReadRegisterAsUnsigned( + gcs_features_enabled_info, LLDB_INVALID_ADDRESS); + if (gcs_features_enabled == LLDB_INVALID_ADDRESS) + return Status("Could not read GCS features enabled register."); + + // Only attempt this if GCS is enabled. 
If it's not enabled then gcspr_el0 + // may point to unmapped memory. + if ((gcs_features_enabled & 1) == 0) + return err; + + const RegisterInfo *gcspr_el0_info = + reg_ctx->GetRegisterInfoByName("gcspr_el0"); + if (!gcspr_el0_info) + return Status("Could not get register info for gcspr_el0."); + + uint64_t gcspr_el0 = + reg_ctx->ReadRegisterAsUnsigned(gcspr_el0_info, LLDB_INVALID_ADDRESS); + if (gcspr_el0 == LLDB_INVALID_ADDRESS) + return Status("Could not read gcspr_el0."); + + // A link register entry on the GCS is 8 bytes. + gcspr_el0 -= 8; + if (!reg_ctx->WriteRegisterFromUnsigned(gcspr_el0_info, gcspr_el0)) + return Status( + "Attempted to decrement gcspr_el0, but could not write to it."); + + Status error; + size_t wrote = thread.GetProcess()->WriteMemory(gcspr_el0, &return_addr, + sizeof(return_addr), error); + if ((wrote != sizeof(return_addr) || error.Fail())) { + // When PrepareTrivialCall fails, the register context is not restored, + // unlike when an expression fails to execute. This is arguably a bug, + // see https://github.com/llvm/llvm-project/issues/124269. + // For now we are handling this here specifically. We can assume this + // write will work as the one to decrement the register did. + reg_ctx->WriteRegisterFromUnsigned(gcspr_el0_info, gcspr_el0 + 8); + return Status("Failed to write new Guarded Control Stack entry."); + } + + Log *log = GetLog(LLDBLog::Expressions); + LLDB_LOGF(log, + "Pushed return address 0x%" PRIx64 " to Guarded Control Stack. " + "gcspr_el0 was 0%" PRIx64 ", is now 0x%" PRIx64 ".", + return_addr, gcspr_el0 - 8, gcspr_el0); + + // gcspr_el0 will be restored to the original value by lldb-server after + // the call has finished, which serves as the "pop". + + return err; +} + bool ABISysV_arm64::PrepareTrivialCall(Thread &thread, addr_t sp, addr_t func_addr, addr_t return_addr, llvm::ArrayRef args) const { @@ -87,6 +150,18 @@ bool ABISysV_arm64::PrepareTrivialCall(Thread &thread, addr_t sp, if (args.size() > 8) return false; + // Do this first, as it's got the most chance of failing (though still very + // low). + if (GetProcessSP()->GetTarget().GetArchitecture().GetTriple().isOSLinux()) { + Status err = PushToLinuxGuardedControlStack(return_addr, reg_ctx, thread); + // If we could not manage the GCS, the expression will certainly fail, + // and if we just carried on, that failure would be a lot more cryptic. + if (err.Fail()) { + LLDB_LOGF(log, "Failed to setup Guarded Call Stack: %s", err.AsCString()); + return false; + } + } + for (size_t i = 0; i < args.size(); ++i) { const RegisterInfo *reg_info = reg_ctx->GetRegisterInfo( eRegisterKindGeneric, LLDB_REGNUM_GENERIC_ARG1 + i); diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp index efd3385c46e92..884c7d4b9e359 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp @@ -1063,9 +1063,27 @@ Status NativeRegisterContextLinux_arm64::WriteAllRegisterValues( std::bind(&NativeRegisterContextLinux_arm64::WriteFPMR, this)); break; case RegisterSetType::GCS: + // It is not permitted to enable GCS via ptrace. We can disable it, but + // to keep things simple we will not revert any change to the + // PR_SHADOW_STACK_ENABLE bit. Instead patch in the current enable bit + // into the registers we are about to restore. 
+ m_gcs_is_valid = false; + error = ReadGCS(); + if (error.Fail()) + return error; + + uint64_t enable_bit = m_gcs_regs.features_enabled & 1UL; + gcs_regs new_gcs_regs = *reinterpret_cast(src); + new_gcs_regs.features_enabled = + (new_gcs_regs.features_enabled & ~1UL) | enable_bit; + + const uint8_t *new_gcs_src = + reinterpret_cast(&new_gcs_regs); error = RestoreRegisters( - GetGCSBuffer(), &src, GetGCSBufferSize(), m_gcs_is_valid, + GetGCSBuffer(), &new_gcs_src, GetGCSBufferSize(), m_gcs_is_valid, std::bind(&NativeRegisterContextLinux_arm64::WriteGCS, this)); + src += GetGCSBufferSize(); + break; } diff --git a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py index d3d4dbecf4a2a..fd46a42b3c69f 100644 --- a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py +++ b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py @@ -3,7 +3,6 @@ extension is enabled. """ - import lldb from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * @@ -84,6 +83,40 @@ def test_gcs_fault(self): ], ) + def check_gcs_registers( + self, + expected_gcs_features_enabled=None, + expected_gcs_features_locked=None, + expected_gcspr_el0=None, + ): + thread = self.dbg.GetSelectedTarget().process.GetThreadAtIndex(0) + registerSets = thread.GetFrameAtIndex(0).GetRegisters() + gcs_registers = registerSets.GetFirstValueByName( + r"Guarded Control Stack Registers" + ) + + gcs_features_enabled = gcs_registers.GetChildMemberWithName( + "gcs_features_enabled" + ).GetValueAsUnsigned() + if expected_gcs_features_enabled is not None: + self.assertEqual(expected_gcs_features_enabled, gcs_features_enabled) + + gcs_features_locked = gcs_registers.GetChildMemberWithName( + "gcs_features_locked" + ).GetValueAsUnsigned() + if expected_gcs_features_locked is not None: + self.assertEqual(expected_gcs_features_locked, gcs_features_locked) + + gcspr_el0 = gcs_registers.GetChildMemberWithName( + "gcspr_el0" + ).GetValueAsUnsigned() + if expected_gcspr_el0 is not None: + self.assertEqual(expected_gcspr_el0, gcspr_el0) + + return gcs_features_enabled, gcs_features_locked, gcspr_el0 + + # This helper reads all the GCS registers and optionally compares them + # against a previous state, then returns the current register values. @skipUnlessArch("aarch64") @skipUnlessPlatform(["linux"]) def test_gcs_registers(self): @@ -108,40 +141,7 @@ def test_gcs_registers(self): self.expect("register read --all", substrs=["Guarded Control Stack Registers:"]) - # This helper reads all the GCS registers and optionally compares them - # against a previous state, then returns the current register values. 
- def check_gcs_registers( - expected_gcs_features_enabled=None, - expected_gcs_features_locked=None, - expected_gcspr_el0=None, - ): - thread = self.dbg.GetSelectedTarget().process.GetThreadAtIndex(0) - registerSets = thread.GetFrameAtIndex(0).GetRegisters() - gcs_registers = registerSets.GetFirstValueByName( - r"Guarded Control Stack Registers" - ) - - gcs_features_enabled = gcs_registers.GetChildMemberWithName( - "gcs_features_enabled" - ).GetValueAsUnsigned() - if expected_gcs_features_enabled is not None: - self.assertEqual(expected_gcs_features_enabled, gcs_features_enabled) - - gcs_features_locked = gcs_registers.GetChildMemberWithName( - "gcs_features_locked" - ).GetValueAsUnsigned() - if expected_gcs_features_locked is not None: - self.assertEqual(expected_gcs_features_locked, gcs_features_locked) - - gcspr_el0 = gcs_registers.GetChildMemberWithName( - "gcspr_el0" - ).GetValueAsUnsigned() - if expected_gcspr_el0 is not None: - self.assertEqual(expected_gcspr_el0, gcspr_el0) - - return gcs_features_enabled, gcs_features_locked, gcspr_el0 - - enabled, locked, spr_el0 = check_gcs_registers() + enabled, locked, spr_el0 = self.check_gcs_registers() # Features enabled should have at least the enable bit set, it could have # others depending on what the C library did, but we can't rely on always @@ -164,7 +164,7 @@ def check_gcs_registers( substrs=["stopped", "stop reason = breakpoint"], ) - _, _, spr_el0 = check_gcs_registers(enabled, locked, spr_el0 - 8) + _, _, spr_el0 = self.check_gcs_registers(enabled, locked, spr_el0 - 8) # Any combination of GCS feature lock bits might have been set by the C # library, and could be set to 0 or 1. To check that we can modify them, @@ -235,3 +235,142 @@ def check_gcs_registers( "exited with status = 0", ], ) + + @skipUnlessPlatform(["linux"]) + def test_gcs_expression_simple(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + # Break before GCS has been enabled. + self.runCmd("b main") + # And after it has been enabled. + lldbutil.run_break_set_by_file_and_line( + self, + "main.c", + line_number("main.c", "// Set break point at this line."), + num_expected_locations=1, + ) + + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # GCS has not been enabled yet and the ABI plugin should know not to + # attempt pushing to the control stack. + before = self.check_gcs_registers() + expr_cmd = "p get_gcs_status()" + self.expect(expr_cmd, substrs=["(unsigned long) 0"]) + self.check_gcs_registers(*before) + + # Continue to when GCS has been enabled. + self.runCmd("continue") + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # If we fail to setup the GCS entry, we should not leave any of the GCS registers + # changed. The last thing we do is write a new GCS entry to memory and + # to simulate the failure of that, temporarily point the GCS to the zero page. + # + # We use the value 8 here because LLDB will decrement it by 8 so it points to + # what we think will be an empty entry on the guarded control stack. 
+ _, _, original_gcspr = self.check_gcs_registers() + self.runCmd("register write gcspr_el0 8") + before = self.check_gcs_registers() + self.expect(expr_cmd, error=True) + self.check_gcs_registers(*before) + # Point to the valid shadow stack region again. + self.runCmd(f"register write gcspr_el0 {original_gcspr}") + + # This time we do need to push to the GCS and having done so, we can + # return from this expression without causing a fault. + before = self.check_gcs_registers() + self.expect(expr_cmd, substrs=["(unsigned long) 1"]) + self.check_gcs_registers(*before) + + @skipUnlessPlatform(["linux"]) + def test_gcs_expression_disable_gcs(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + # Break after GCS is enabled. + lldbutil.run_break_set_by_file_and_line( + self, + "main.c", + line_number("main.c", "// Set break point at this line."), + num_expected_locations=1, + ) + + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # Unlock all features so the expression can enable them again. + self.runCmd("register write gcs_features_locked 0") + # Disable all features, but keep GCS itself enabled. + PR_SHADOW_STACK_ENABLE = 1 + self.runCmd(f"register write gcs_features_enabled 0x{PR_SHADOW_STACK_ENABLE:x}") + + enabled, locked, spr_el0 = self.check_gcs_registers() + # We restore everything apart GCS being enabled, as we are not allowed to + # go from disabled -> enabled via ptrace. + self.expect("p change_gcs_config(false)", substrs=["true"]) + enabled &= ~1 + self.check_gcs_registers(enabled, locked, spr_el0) + + @skipUnlessPlatform(["linux"]) + def test_gcs_expression_enable_gcs(self): + if not self.isAArch64GCS(): + self.skipTest("Target must support GCS.") + + self.build() + self.runCmd("file " + self.getBuildArtifact("a.out"), CURRENT_EXECUTABLE_SET) + + # Break before GCS is enabled. + self.runCmd("b main") + + self.runCmd("run", RUN_SUCCEEDED) + + if self.process().GetState() == lldb.eStateExited: + self.fail("Test program failed to run.") + + self.expect( + "thread list", + STOPPED_DUE_TO_BREAKPOINT, + substrs=["stopped", "stop reason = breakpoint"], + ) + + # Unlock all features so the expression can enable them again. + self.runCmd("register write gcs_features_locked 0") + # Disable all features. The program needs PR_SHADOW_STACK_PUSH, but it + # will enable that itself. + self.runCmd(f"register write gcs_features_enabled 0") + + enabled, locked, spr_el0 = self.check_gcs_registers() + self.expect("p change_gcs_config(true)", substrs=["true"]) + # Though we could disable GCS with ptrace, we choose not to to be + # consistent with the disabled -> enabled behaviour. 
+ enabled |= 1 + self.check_gcs_registers(enabled, locked, spr_el0) diff --git a/lldb/test/API/linux/aarch64/gcs/main.c b/lldb/test/API/linux/aarch64/gcs/main.c index 09354639af376..396aef7499ca9 100644 --- a/lldb/test/API/linux/aarch64/gcs/main.c +++ b/lldb/test/API/linux/aarch64/gcs/main.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -8,7 +9,12 @@ #define PR_GET_SHADOW_STACK_STATUS 74 #define PR_SET_SHADOW_STACK_STATUS 75 -#define PR_SHADOW_STACK_ENABLE (1UL) +#define PR_LOCK_SHADOW_STACK_STATUS 76 + +#define PR_SHADOW_STACK_ENABLE (1UL << 0) +#define PR_SHADOW_STACK_WRITE (1UL << 1) +#define PR_SHADOW_STACK_PUSH (1UL << 2) + #define PRCTL_SYSCALL_NO 167 // Once we enable GCS, we cannot return from the function that made the syscall @@ -36,6 +42,36 @@ unsigned long get_gcs_status() { return mode; } +extern void _start(); +bool change_gcs_config(bool enable) { + // The test unlocks and disables all features (excluding the main enable bit) + // before calling this expression. Enable them again. + unsigned long new_status = + enable | PR_SHADOW_STACK_PUSH | PR_SHADOW_STACK_WRITE; + + if (enable) { + // We would not be able to return from prctl(). + my_prctl(PR_SET_SHADOW_STACK_STATUS, new_status, 0, 0, 0); + + // This is a stack, so we must push in reverse order to the pops we want to + // have later. So push the return of __lldb_expr (_start), then the return + // address of this function (__lldb_expr). + __asm__ __volatile__("sys #3, C7, C7, #0, %0\n" // gcspushm _start + "sys #3, C7, C7, #0, x30\n" // gcspushm x30 + : + : "r"(_start)); + } else { + if (prctl(PR_SET_SHADOW_STACK_STATUS, new_status, 0, 0, 0) != 0) + return false; + } + + // Turn back on all locks. + if (prctl(PR_LOCK_SHADOW_STACK_STATUS, ~(0UL), 0, 0, 0) != 0) + return false; + + return true; +} + void gcs_signal() { // If we enabled GCS manually, then we could just return from main to generate // a signal. However, if the C library enabled it, then we'd just exit @@ -50,10 +86,11 @@ void gcs_signal() { } // These functions are used to observe gcspr_el0 changing as we enter them, and -// the fault we cause by changing its value. -void test_func2() { volatile int i = 99; } +// the fault we cause by changing its value. Also used to check expression +// eval can handle function calls. +int test_func2() { return 99; } -void test_func() { test_func2(); } +int test_func() { return test_func2(); } int main() { if (!(getauxval(AT_HWCAP) & HWCAP_GCS)) @@ -71,7 +108,7 @@ int main() { // By now we should have one memory region where the GCS is stored. // For register read/write tests. - test_func(); + volatile int i = test_func(); // If this was a register test, we would have disabled GCS during the // test_func call. We cannot re-enable it from ptrace so skip this part in From ef54e0bbfbef59932a59a1640f1f9e14b70cc41b Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 27 Jan 2025 13:12:11 +0000 Subject: [PATCH 185/432] [AArch64] Avoid generating LDAPUR on certain cores (#124274) On the CPUs listed below, we want to avoid LDAPUR for performance reasons. 
Add a tuning feature to disable them when using: -mcpu=neoverse-v2 -mcpu=neoverse-v3 -mcpu=cortex-x3 -mcpu=cortex-x4 -mcpu=cortex-x925 --- llvm/lib/Target/AArch64/AArch64Features.td | 6 +- .../lib/Target/AArch64/AArch64InstrAtomics.td | 4 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 + llvm/lib/Target/AArch64/AArch64Processors.td | 6 + .../Atomics/aarch64-atomic-load-rcpc_immo.ll | 144 ++++++++++++++---- 5 files changed, 127 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 0a91edb4c1661..20db70ee38572 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -805,10 +805,14 @@ def FeatureLdpAlignedOnly : SubtargetFeature<"ldp-aligned-only", "HasLdpAlignedO def FeatureStpAlignedOnly : SubtargetFeature<"stp-aligned-only", "HasStpAlignedOnly", "true", "In order to emit stp, first check if the store will be aligned to 2 * element_size">; -def FeatureUseFixedOverScalableIfEqualCost: SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", +def FeatureUseFixedOverScalableIfEqualCost : SubtargetFeature<"use-fixed-over-scalable-if-equal-cost", "UseFixedOverScalableIfEqualCost", "true", "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">; +// For performance reasons we prefer to use ldapr to ldapur on certain cores. +def FeatureAvoidLDAPUR : SubtargetFeature<"avoid-ldapur", "AvoidLDAPUR", "true", + "Prefer add+ldapr to offset ldapur">; + //===----------------------------------------------------------------------===// // Architectures. // diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index de94cf64c9801..5e6db9d007a55 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -575,7 +575,7 @@ let Predicates = [HasRCPC3, HasNEON] in { } // v8.4a FEAT_LRCPC2 patterns -let Predicates = [HasRCPC_IMMO] in { +let Predicates = [HasRCPC_IMMO, UseLDAPUR] in { // Load-Acquire RCpc Register unscaled loads def : Pat<(acquiring_load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), @@ -589,7 +589,9 @@ let Predicates = [HasRCPC_IMMO] in { def : Pat<(acquiring_load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (LDAPURXi GPR64sp:$Rn, simm9:$offset)>; +} +let Predicates = [HasRCPC_IMMO] in { // Store-Release Register unscaled stores def : Pat<(releasing_store (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val), diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index fa6385409f30c..9d0bd44544134 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -389,6 +389,8 @@ def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">; def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">; +def UseLDAPUR : Predicate<"!Subtarget->avoidLDAPUR()">; + def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisInt<1>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 0e3c4e8397f52..8a2c0442a0c0d 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -240,6 +240,7 @@ def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, 
FeaturePredictableSelectIsExpensive]>; def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", @@ -250,6 +251,7 @@ def TuneX4 : SubtargetFeature<"cortex-x4", "ARMProcFamily", "CortexX4", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", @@ -260,6 +262,7 @@ def TuneX925 : SubtargetFeature<"cortex-x925", "ARMProcFamily", FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", @@ -540,6 +543,7 @@ def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2 FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeatureUseFixedOverScalableIfEqualCost, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3", @@ -549,6 +553,7 @@ def TuneNeoverseV3 : SubtargetFeature<"neoversev3", "ARMProcFamily", "NeoverseV3 FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "NeoverseV3", @@ -558,6 +563,7 @@ def TuneNeoverseV3AE : SubtargetFeature<"neoversev3AE", "ARMProcFamily", "Neover FeatureFuseAdrpAdd, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureAvoidLDAPUR, FeaturePredictableSelectIsExpensive]>; def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", diff --git a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll index 9687ba683fb7e..b475e68db411a 100644 --- a/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll +++ b/llvm/test/CodeGen/AArch64/Atomics/aarch64-atomic-load-rcpc_immo.ll @@ -1,6 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "(?!^\s*lda.*\bsp\b)^\s*.*\bsp\b" --filter "^\s*(ld|st[^r]|swp|cas|bl|add|and|eor|orn|orr|sub|mvn|sxt|cmp|ccmp|csel|dmb)" ; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=true -global-isel-abort=2 -O0 | FileCheck %s --check-prefixes=CHECK,GISEL -; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-NOAVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mattr=+v8.4a -mattr=+rcpc-immo,avoid-ldapur -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v2 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-v3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x3 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x4 -global-isel=false -O1 | 
FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR +; RUN: llc %s -o - -verify-machineinstrs -mtriple=aarch64 -mcpu=cortex-x925 -global-isel=false -O1 | FileCheck %s --check-prefixes=CHECK,SDAG,SDAG-AVOIDLDAPUR define i8 @load_atomic_i8_aligned_unordered(ptr %ptr) { ; CHECK-LABEL: load_atomic_i8_aligned_unordered: @@ -39,8 +45,12 @@ define i8 @load_atomic_i8_aligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_aligned_acquire: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -51,8 +61,12 @@ define i8 @load_atomic_i8_aligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_aligned_acquire_const: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -113,8 +127,12 @@ define i16 @load_atomic_i16_aligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #8 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_aligned_acquire: -; SDAG: ldapurh w0, [x0, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [x0, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #8 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -125,8 +143,12 @@ define i16 @load_atomic_i16_aligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #8 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_aligned_acquire_const: -; SDAG: ldapurh w0, [x0, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [x0, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #8 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %gep = getelementptr inbounds i16, ptr %ptr, i32 4 %r = load atomic i16, ptr %gep acquire, align 2 ret i16 %r @@ -183,16 +205,30 @@ define i32 @load_atomic_i32_aligned_monotonic_const(ptr readonly %ptr) { } define i32 @load_atomic_i32_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i32_aligned_acquire: -; CHECK: ldapur w0, [x0, #16] +; GISEL-LABEL: load_atomic_i32_aligned_acquire: +; GISEL: ldapur w0, [x0, #16] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapur w0, [x0, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #16 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %r = load atomic i32, ptr %gep acquire, align 4 ret i32 %r } define i32 @load_atomic_i32_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i32_aligned_acquire_const: -; CHECK: ldapur w0, [x0, #16] +; GISEL-LABEL: load_atomic_i32_aligned_acquire_const: +; GISEL: ldapur w0, [x0, #16] +; +; SDAG-NOAVOIDLDAPUR-LABEL: 
load_atomic_i32_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapur w0, [x0, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #16 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %gep = getelementptr inbounds i32, ptr %ptr, i32 4 %r = load atomic i32, ptr %gep acquire, align 4 ret i32 %r @@ -249,16 +285,30 @@ define i64 @load_atomic_i64_aligned_monotonic_const(ptr readonly %ptr) { } define i64 @load_atomic_i64_aligned_acquire(ptr %ptr) { -; CHECK-LABEL: load_atomic_i64_aligned_acquire: -; CHECK: ldapur x0, [x0, #32] +; GISEL-LABEL: load_atomic_i64_aligned_acquire: +; GISEL: ldapur x0, [x0, #32] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapur x0, [x0, #32] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #32 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %r = load atomic i64, ptr %gep acquire, align 8 ret i64 %r } define i64 @load_atomic_i64_aligned_acquire_const(ptr readonly %ptr) { -; CHECK-LABEL: load_atomic_i64_aligned_acquire_const: -; CHECK: ldapur x0, [x0, #32] +; GISEL-LABEL: load_atomic_i64_aligned_acquire_const: +; GISEL: ldapur x0, [x0, #32] +; +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapur x0, [x0, #32] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_aligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #32 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %gep = getelementptr inbounds i64, ptr %ptr, i32 4 %r = load atomic i64, ptr %gep acquire, align 8 ret i64 %r @@ -387,8 +437,12 @@ define i8 @load_atomic_i8_unaligned_acquire(ptr %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_unaligned_acquire: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -399,8 +453,12 @@ define i8 @load_atomic_i8_unaligned_acquire_const(ptr readonly %ptr) { ; GISEL: add x8, x0, #4 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_unaligned_acquire_const: -; SDAG: ldapurb w0, [x0, #4] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [x0, #4] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_unaligned_acquire_const: +; SDAG-AVOIDLDAPUR: add x8, x0, #4 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %gep = getelementptr inbounds i8, ptr %ptr, i32 4 %r = load atomic i8, ptr %gep acquire, align 1 ret i8 %r @@ -846,9 +904,14 @@ define i8 @load_atomic_i8_from_gep() { ; GISEL: add x8, x8, #1 ; GISEL: ldaprb w0, [x8] ; -; SDAG-LABEL: load_atomic_i8_from_gep: -; SDAG: bl init -; SDAG: ldapurb w0, [sp, #13] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i8_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapurb w0, [sp, #13] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i8_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: orr x8, x19, #0x1 +; SDAG-AVOIDLDAPUR: ldaprb w0, [x8] %a = alloca [3 x i8] call void @init(ptr %a) %arrayidx = getelementptr [3 x i8], ptr %a, i64 0, i64 1 @@ -862,9 +925,14 @@ define i16 @load_atomic_i16_from_gep() { ; GISEL: add x8, x8, #2 ; GISEL: ldaprh w0, [x8] ; -; SDAG-LABEL: load_atomic_i16_from_gep: -; SDAG: bl init -; SDAG: ldapurh w0, [sp, 
#10] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i16_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapurh w0, [sp, #10] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i16_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: orr x8, x19, #0x2 +; SDAG-AVOIDLDAPUR: ldaprh w0, [x8] %a = alloca [3 x i16] call void @init(ptr %a) %arrayidx = getelementptr [3 x i16], ptr %a, i64 0, i64 1 @@ -877,9 +945,14 @@ define i32 @load_atomic_i32_from_gep() { ; GISEL: bl init ; GISEL: ldapur w0, [x8, #4] ; -; SDAG-LABEL: load_atomic_i32_from_gep: -; SDAG: bl init -; SDAG: ldapur w0, [sp, #8] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i32_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapur w0, [sp, #8] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i32_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: add x8, x19, #4 +; SDAG-AVOIDLDAPUR: ldapr w0, [x8] %a = alloca [3 x i32] call void @init(ptr %a) %arrayidx = getelementptr [3 x i32], ptr %a, i64 0, i64 1 @@ -892,9 +965,14 @@ define i64 @load_atomic_i64_from_gep() { ; GISEL: bl init ; GISEL: ldapur x0, [x8, #8] ; -; SDAG-LABEL: load_atomic_i64_from_gep: -; SDAG: bl init -; SDAG: ldapur x0, [sp, #16] +; SDAG-NOAVOIDLDAPUR-LABEL: load_atomic_i64_from_gep: +; SDAG-NOAVOIDLDAPUR: bl init +; SDAG-NOAVOIDLDAPUR: ldapur x0, [sp, #16] +; +; SDAG-AVOIDLDAPUR-LABEL: load_atomic_i64_from_gep: +; SDAG-AVOIDLDAPUR: bl init +; SDAG-AVOIDLDAPUR: add x8, x19, #8 +; SDAG-AVOIDLDAPUR: ldapr x0, [x8] %a = alloca [3 x i64] call void @init(ptr %a) %arrayidx = getelementptr [3 x i64], ptr %a, i64 0, i64 1 From 347fb208c1e390a4f108e566efc81bd945837307 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 27 Jan 2025 13:25:37 +0000 Subject: [PATCH 186/432] [libclc] Optimize CLC vector relational builtins (#124537) Clang knows how to perform relational operations on OpenCL vectors, so we don't need to use the Clang builtins. The builtins we were using didn't support vector types, so we were previously scalarizing. This commit generates the same LLVM fcmp operations as before, just without the scalarization. 
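As a rough sketch of the mechanism (an abbreviated, illustrative expansion:
the real _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY macro in relational.h below
emits all six overloads), defining

    #define _CLC_RELATIONAL_OP(X, Y) (X) == (Y)
    _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isequal, double, double)

expands to overloads along the lines of

    _CLC_DEF _CLC_OVERLOAD int __clc_isequal(double x, double y) {
      return (x) == (y);
    }
    _CLC_DEF _CLC_OVERLOAD long2 __clc_isequal(double2 x, double2 y) {
      /* OpenCL vector == compares element-wise and yields -1/0 per lane,
         which Clang lowers to the same fcmp as the old scalarized code. */
      return (x) == (y);
    }

and likewise for the 3-, 4-, 8- and 16-wide vector types.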
--- .../clc/include/clc/relational/relational.h | 26 +++++++++++++ .../clc/lib/generic/relational/clc_isequal.cl | 38 ++++++------------- .../lib/generic/relational/clc_isgreater.cl | 23 +++-------- .../generic/relational/clc_isgreaterequal.cl | 30 ++++++--------- .../clc/lib/generic/relational/clc_isless.cl | 31 ++++++--------- .../lib/generic/relational/clc_islessequal.cl | 24 ++++-------- .../generic/relational/clc_islessgreater.cl | 27 +++++-------- .../lib/generic/relational/clc_isnotequal.cl | 21 ++++------ 8 files changed, 90 insertions(+), 130 deletions(-) diff --git a/libclc/clc/include/clc/relational/relational.h b/libclc/clc/include/clc/relational/relational.h index 54241b6493c8e..f32e7630203e4 100644 --- a/libclc/clc/include/clc/relational/relational.h +++ b/libclc/clc/include/clc/relational/relational.h @@ -142,4 +142,30 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, \ ARG1_TYPE) +#define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(RET_TYPE, RET_TYPE_VEC, FUNCTION, \ + ARG1_TYPE, ARG2_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##2 FUNCTION(ARG1_TYPE##2 x, \ + ARG2_TYPE##2 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##3 FUNCTION(ARG1_TYPE##3 x, \ + ARG2_TYPE##3 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##4 FUNCTION(ARG1_TYPE##4 x, \ + ARG2_TYPE##4 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##8 FUNCTION(ARG1_TYPE##8 x, \ + ARG2_TYPE##8 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##16 FUNCTION(ARG1_TYPE##16 x, \ + ARG2_TYPE##16 y) { \ + return _CLC_RELATIONAL_OP(x, y); \ + } + #endif // __CLC_RELATIONAL_RELATIONAL_H__ diff --git a/libclc/clc/lib/generic/relational/clc_isequal.cl b/libclc/clc/lib/generic/relational/clc_isequal.cl index 7664df7767cb3..053a237289fd6 100644 --- a/libclc/clc/lib/generic/relational/clc_isequal.cl +++ b/libclc/clc/lib/generic/relational/clc_isequal.cl @@ -1,44 +1,28 @@ #include +#include -#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ - return (x == y); \ - } +#define _CLC_RELATIONAL_OP(X, Y) (X) == (Y) -_CLC_DEFINE_ISEQUAL(int, __clc_isequal, float, float) -_CLC_DEFINE_ISEQUAL(int2, __clc_isequal, float2, float2) -_CLC_DEFINE_ISEQUAL(int3, __clc_isequal, float3, float3) -_CLC_DEFINE_ISEQUAL(int4, __clc_isequal, float4, float4) -_CLC_DEFINE_ISEQUAL(int8, __clc_isequal, float8, float8) -_CLC_DEFINE_ISEQUAL(int16, __clc_isequal, float16, float16) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isequal, float, float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -// The scalar version of __clc_isequal(double) returns an int, but the vector -// versions return long. -_CLC_DEFINE_ISEQUAL(int, __clc_isequal, double, double) -_CLC_DEFINE_ISEQUAL(long2, __clc_isequal, double2, double2) -_CLC_DEFINE_ISEQUAL(long3, __clc_isequal, double3, double3) -_CLC_DEFINE_ISEQUAL(long4, __clc_isequal, double4, double4) -_CLC_DEFINE_ISEQUAL(long8, __clc_isequal, double8, double8) -_CLC_DEFINE_ISEQUAL(long16, __clc_isequal, double16, double16) +// The scalar version of __clc_isequal(double, double) returns an int, but the +// vector versions return long. 
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isequal, double, double) #endif + #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -// The scalar version of __clc_isequal(half) returns an int, but the vector -// versions return short. -_CLC_DEFINE_ISEQUAL(int, __clc_isequal, half, half) -_CLC_DEFINE_ISEQUAL(short2, __clc_isequal, half2, half2) -_CLC_DEFINE_ISEQUAL(short3, __clc_isequal, half3, half3) -_CLC_DEFINE_ISEQUAL(short4, __clc_isequal, half4, half4) -_CLC_DEFINE_ISEQUAL(short8, __clc_isequal, half8, half8) -_CLC_DEFINE_ISEQUAL(short16, __clc_isequal, half16, half16) +// The scalar version of __clc_isequal(half, half) returns an int, but the +// vector versions return short. +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isequal, half, half) #endif -#undef _CLC_DEFINE_ISEQUAL +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isgreater.cl b/libclc/clc/lib/generic/relational/clc_isgreater.cl index 39fb6b07fb185..ec14fa9a2ec08 100644 --- a/libclc/clc/lib/generic/relational/clc_isgreater.cl +++ b/libclc/clc/lib/generic/relational/clc_isgreater.cl @@ -1,12 +1,9 @@ #include #include -// Note: It would be nice to use __builtin_isgreater with vector inputs, but it -// seems to only take scalar values as input, which will produce incorrect -// output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) (X) > (Y) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float, - float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreater, float, float) #ifdef cl_khr_fp64 @@ -14,12 +11,7 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float, // The scalar version of __clc_isgreater(double, double) returns an int, but the // vector versions return long. - -_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(double x, double y) { - return __builtin_isgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreater, double, double) #endif @@ -29,11 +21,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double) // The scalar version of __clc_isgreater(half, half) returns an int, but the // vector versions return short. - -_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(half x, half y) { - return __builtin_isgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreater, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreater, half, half) #endif + +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl index ccf7c881a5549..e96f2325cbad4 100644 --- a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl +++ b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl @@ -1,12 +1,10 @@ #include #include -// Note: It would be nice to use __builtin_isgreaterequal with vector inputs, -// but it seems to only take scalar values as input, which will produce -// incorrect output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) (X) >= (Y) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal, - __builtin_isgreaterequal, float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreaterequal, float, + float) #ifdef cl_khr_fp64 @@ -14,26 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal, // The scalar version of __clc_isgreaterequal(double, double) returns an int, // but the vector versions return long. 
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(double x, double y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreaterequal, double,
-                                      double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreaterequal, double,
+                                     double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 // The scalar version of __clc_isgreaterequal(half, half) returns an int, but
 // the vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(half x, half y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreaterequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreaterequal, half,
+                                     half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_isless.cl b/libclc/clc/lib/generic/relational/clc_isless.cl
index 1204a5057d864..0ce001d31d696 100644
--- a/libclc/clc/lib/generic/relational/clc_isless.cl
+++ b/libclc/clc/lib/generic/relational/clc_isless.cl
@@ -1,37 +1,28 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_isless with vector inputs, but it
-// seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) < (Y)
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isless, __builtin_isless, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isless, float, float)
 
 #ifdef cl_khr_fp64
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-// The scalar version of __clc_isless(double, double) returns an int, but the
-// vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(double x, double y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isless, double, double)
+// The scalar version of __clc_isless(double, double) returns an int, but
+// the vector versions return long.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isless, double, double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-// The scalar version of __clc_isless(half, half) returns an int, but the vector
-// versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(half x, half y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isless, half, half)
+// The scalar version of __clc_isless(half, half) returns an int, but the
+// vector versions return short.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isless, half, half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_islessequal.cl b/libclc/clc/lib/generic/relational/clc_islessequal.cl
index 6fde763263e2b..2d1d6d199fdab 100644
--- a/libclc/clc/lib/generic/relational/clc_islessequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_islessequal.cl
@@ -1,12 +1,9 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_islessequal with vector inputs, but
-// it seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) <= (Y) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal, - float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessequal, float, float) #ifdef cl_khr_fp64 @@ -14,12 +11,8 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal, // The scalar version of __clc_islessequal(double, double) returns an int, but // the vector versions return long. - -_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(double x, double y) { - return __builtin_islessequal(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessequal, double, + double) #endif @@ -29,11 +22,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double) // The scalar version of __clc_islessequal(half, half) returns an int, but the // vector versions return short. - -_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(half x, half y) { - return __builtin_islessequal(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessequal, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessequal, half, half) #endif + +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_islessgreater.cl b/libclc/clc/lib/generic/relational/clc_islessgreater.cl index 5106c9f460e2c..3ca3c37731d15 100644 --- a/libclc/clc/lib/generic/relational/clc_islessgreater.cl +++ b/libclc/clc/lib/generic/relational/clc_islessgreater.cl @@ -1,12 +1,10 @@ #include #include -// Note: It would be nice to use __builtin_islessgreater with vector inputs, but -// it seems to only take scalar values as input, which will produce incorrect -// output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) ((X) < (Y)) || ((X) > (Y)) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater, - float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessgreater, float, + float) #ifdef cl_khr_fp64 @@ -14,25 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater, // The scalar version of __clc_islessgreater(double, double) returns an int, but // the vector versions return long. - -_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(double x, double y) { - return __builtin_islessgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessgreater, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessgreater, double, + double) #endif + #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_islessgreater(half, half) returns an int, but the // vector versions return short. 
- -_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(half x, half y) { - return __builtin_islessgreater(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessgreater, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessgreater, half, + half) #endif + +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isnotequal.cl b/libclc/clc/lib/generic/relational/clc_isnotequal.cl index 9f90713b2da50..d1ee4deab25c8 100644 --- a/libclc/clc/lib/generic/relational/clc_isnotequal.cl +++ b/libclc/clc/lib/generic/relational/clc_isnotequal.cl @@ -1,33 +1,28 @@ #include #include -#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ - return (x != y); \ - } +#define _CLC_RELATIONAL_OP(X, Y) (X) != (Y) -_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, float, float) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isnotequal, float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isnotequal, float, float) #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable // The scalar version of __clc_isnotequal(double, double) returns an int, but // the vector versions return long. - -_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, double, double) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isnotequal, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isnotequal, double, double) #endif + #ifdef cl_khr_fp16 + #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_isnotequal(half, half) returns an int, but the // vector versions return short. - -_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, half, half) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isnotequal, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isnotequal, half, half) #endif -#undef _CLC_DEFINE_ISNOTEQUAL +#undef _CLC_RELATIONAL_OP From e9e06bea8661ddd474557a0db2cdc8770a55b66f Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 27 Jan 2025 13:27:31 +0000 Subject: [PATCH 187/432] [lldb][AArch64][NFC] Move a comment in GCS tests Got put in the wrong place during a rebase. --- lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py index fd46a42b3c69f..797551b061a23 100644 --- a/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py +++ b/lldb/test/API/linux/aarch64/gcs/TestAArch64LinuxGCS.py @@ -83,6 +83,8 @@ def test_gcs_fault(self): ], ) + # This helper reads all the GCS registers and optionally compares them + # against a previous state, then returns the current register values. def check_gcs_registers( self, expected_gcs_features_enabled=None, @@ -115,8 +117,6 @@ def check_gcs_registers( return gcs_features_enabled, gcs_features_locked, gcspr_el0 - # This helper reads all the GCS registers and optionally compares them - # against a previous state, then returns the current register values. @skipUnlessArch("aarch64") @skipUnlessPlatform(["linux"]) def test_gcs_registers(self): From d7e561b913d2a75c7c1807bf1c1e0bddc270a2b3 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Mon, 27 Jan 2025 15:02:38 +0100 Subject: [PATCH 188/432] [flang][OpenMP] Support `bind` clause code-gen for standalone `loop`s (#122674) Extends rewriting of `loop` directives by supporting `bind` clause for standalone directives. 
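For example (a sketch mirroring the new tests added below; the variable names
and bounds are illustrative), a standalone construct such as

    !$omp loop bind(parallel) private(a)
    do i = 1, n
      c(i) = a(i) * b(i)
    end do

is now rewritten to an omp.wsloop wrapper instead of hitting the todo("bind")
path.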
This follows both the spec and the current state of clang as follows:
* No `bind` or `bind(thread)`: the `loop` is rewritten to `simd`.
* `bind(parallel)`: the `loop` is rewritten to `do`.
* `bind(teams)`: the `loop` is rewritten to `distribute`.

This is a follow-up PR for
https://github.com/llvm/llvm-project/pull/122632; only the latest commit
in this PR is relevant here.
---
 .../OpenMP/GenericLoopConversion.cpp          | 86 +++++++++++++++----
 flang/test/Lower/OpenMP/loop-directive.f90    | 42 ++++++++-
 2 files changed, 109 insertions(+), 19 deletions(-)

diff --git a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
index 555601c5e92df..c95d625d7240b 100644
--- a/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/GenericLoopConversion.cpp
@@ -53,7 +53,7 @@ class GenericLoopConversionPattern
     switch (combinedInfo) {
     case GenericLoopCombinedInfo::Standalone:
-      rewriteToSimdLoop(loopOp, rewriter);
+      rewriteStandaloneLoop(loopOp, rewriter);
       break;
     case GenericLoopCombinedInfo::TargetParallelLoop:
       llvm_unreachable("not yet implemented: `parallel loop` directive");
@@ -87,7 +87,10 @@ class GenericLoopConversionPattern
              << loopOp->getName() << " operation";
     };
 
-    if (loopOp.getBindKind())
+    // For standalone directives, `bind` is already supported. Other combined
+    // forms will be supported in a follow-up PR.
+    if (combinedInfo != GenericLoopCombinedInfo::Standalone &&
+        loopOp.getBindKind())
       return todo("bind");
 
     if (loopOp.getOrder())
@@ -119,7 +122,27 @@ class GenericLoopConversionPattern
     return result;
   }
 
-  /// Rewrites standalone `loop` directives to equivalent `simd` constructs.
+  void rewriteStandaloneLoop(mlir::omp::LoopOp loopOp,
+                             mlir::ConversionPatternRewriter &rewriter) const {
+    using namespace mlir::omp;
+    std::optional<ClauseBindKind> bindKind = loopOp.getBindKind();
+
+    if (!bindKind.has_value())
+      return rewriteToSimdLoop(loopOp, rewriter);
+
+    switch (*loopOp.getBindKind()) {
+    case ClauseBindKind::Parallel:
+      return rewriteToWsloop(loopOp, rewriter);
+    case ClauseBindKind::Teams:
+      return rewriteToDistrbute(loopOp, rewriter);
+    case ClauseBindKind::Thread:
+      return rewriteToSimdLoop(loopOp, rewriter);
+    }
+  }
+
+  /// Rewrites standalone `loop` (without `bind` clause or with
+  /// `bind(thread)`) directives to equivalent `simd` constructs.
+  ///
   /// The reasoning behind this decision is that according to the spec (version
   /// 5.2, section 11.7.1):
   ///
@@ -147,30 +170,57 @@ class GenericLoopConversionPattern
   /// the directive.
void rewriteToSimdLoop(mlir::omp::LoopOp loopOp, mlir::ConversionPatternRewriter &rewriter) const { - loopOp.emitWarning("Detected standalone OpenMP `loop` directive, the " - "associated loop will be rewritten to `simd`."); - mlir::omp::SimdOperands simdClauseOps; - simdClauseOps.privateVars = loopOp.getPrivateVars(); + loopOp.emitWarning( + "Detected standalone OpenMP `loop` directive with thread binding, " + "the associated loop will be rewritten to `simd`."); + rewriteToSingleWrapperOp( + loopOp, rewriter); + } + + void rewriteToDistrbute(mlir::omp::LoopOp loopOp, + mlir::ConversionPatternRewriter &rewriter) const { + rewriteToSingleWrapperOp(loopOp, rewriter); + } + + void rewriteToWsloop(mlir::omp::LoopOp loopOp, + mlir::ConversionPatternRewriter &rewriter) const { + rewriteToSingleWrapperOp( + loopOp, rewriter); + } + + // TODO Suggestion by Sergio: tag auto-generated operations for constructs + // that weren't part of the original program, that would be useful + // information for debugging purposes later on. This new attribute could be + // used for `omp.loop`, but also for `do concurrent` transformations, + // `workshare`, `workdistribute`, etc. The tag could be used for all kinds of + // auto-generated operations using a dialect attribute (named something like + // `omp.origin` or `omp.derived`) and perhaps hold the name of the operation + // it was derived from, the reason it was transformed or something like that + // we could use when emitting any messages related to it later on. + template + void + rewriteToSingleWrapperOp(mlir::omp::LoopOp loopOp, + mlir::ConversionPatternRewriter &rewriter) const { + OpOperandsTy clauseOps; + clauseOps.privateVars = loopOp.getPrivateVars(); auto privateSyms = loopOp.getPrivateSyms(); if (privateSyms) - simdClauseOps.privateSyms.assign(privateSyms->begin(), - privateSyms->end()); + clauseOps.privateSyms.assign(privateSyms->begin(), privateSyms->end()); - Fortran::common::openmp::EntryBlockArgs simdArgs; - simdArgs.priv.vars = simdClauseOps.privateVars; + Fortran::common::openmp::EntryBlockArgs args; + args.priv.vars = clauseOps.privateVars; - auto simdOp = - rewriter.create(loopOp.getLoc(), simdClauseOps); - mlir::Block *simdBlock = - genEntryBlock(rewriter, simdArgs, simdOp.getRegion()); + auto wrapperOp = rewriter.create(loopOp.getLoc(), clauseOps); + mlir::Block *opBlock = genEntryBlock(rewriter, args, wrapperOp.getRegion()); mlir::IRMapping mapper; mlir::Block &loopBlock = *loopOp.getRegion().begin(); - for (auto [loopOpArg, simdopArg] : - llvm::zip_equal(loopBlock.getArguments(), simdBlock->getArguments())) - mapper.map(loopOpArg, simdopArg); + for (auto [loopOpArg, opArg] : + llvm::zip_equal(loopBlock.getArguments(), opBlock->getArguments())) + mapper.map(loopOpArg, opArg); rewriter.clone(*loopOp.begin(), mapper); } diff --git a/flang/test/Lower/OpenMP/loop-directive.f90 b/flang/test/Lower/OpenMP/loop-directive.f90 index 9fa0de3bfe171..845905da0fcba 100644 --- a/flang/test/Lower/OpenMP/loop-directive.f90 +++ b/flang/test/Lower/OpenMP/loop-directive.f90 @@ -92,7 +92,7 @@ subroutine test_reduction() ! CHECK-LABEL: func.func @_QPtest_bind subroutine test_bind() integer :: i, dummy = 1 - ! CHECK: omp.loop bind(thread) private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { + ! CHECK: omp.simd private(@{{.*}} %{{.*}}#0 -> %{{.*}} : {{.*}}) { ! CHECK: } !$omp loop bind(thread) do i=1,10 @@ -139,3 +139,43 @@ subroutine test_nested_directives end do !$omp end target teams end subroutine + +! 
CHECK-LABEL: func.func @_QPtest_standalone_bind_teams
+subroutine test_standalone_bind_teams
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK: omp.distribute
+  ! CHECK-SAME: private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
+  ! CHECK-SAME: @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+  ! CHECK: omp.loop_nest {{.*}} {
+  ! CHECK: }
+  ! CHECK: }
+  !$omp loop bind(teams) private(a)
+  do i=1,N
+    c(i) = a(i) * b(i)
+  end do
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtest_standalone_bind_parallel
+subroutine test_standalone_bind_parallel
+  implicit none
+  integer, parameter :: N = 100000
+  integer a(N), b(N), c(N)
+  integer j,i, num, flag;
+  num = N
+
+  ! CHECK: omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}Ea_private_ref_100000xi32 {{[^,]*}},
+  ! CHECK-SAME: @{{.*}}Ei_private_ref_i32 {{.*}} : {{.*}}) {
+  ! CHECK: omp.loop_nest {{.*}} {
+  ! CHECK: }
+  ! CHECK: }
+  !$omp loop bind(parallel) private(a)
+  do i=1,N
+    c(i) = a(i) * b(i)
+  end do
+end subroutine
From e7592d83e0ac58f61cfe8dcf61bcc8e7a8bd67b3 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Mon, 27 Jan 2025 08:00:07 -0600
Subject: [PATCH 189/432] [Offload][NFC] Make sure the thread is not running
 already

---
 offload/plugins-nextgen/common/include/RPC.h | 2 +-
 offload/plugins-nextgen/common/src/RPC.cpp   | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/offload/plugins-nextgen/common/include/RPC.h b/offload/plugins-nextgen/common/include/RPC.h
index f3a8e7555020d..42fca4aa4aebc 100644
--- a/offload/plugins-nextgen/common/include/RPC.h
+++ b/offload/plugins-nextgen/common/include/RPC.h
@@ -99,7 +99,7 @@ struct RPCServerTy {
   /// Initialize the worker thread to run in the background.
   ServerThread(void *Buffers[], plugin::GenericDeviceTy *Devices[],
                size_t Length)
-      : Running(true), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
+      : Running(false), NumUsers(0), CV(), Mutex(), Buffers(Buffers, Length),
         Devices(Devices, Length) {}
 
   ~ServerThread() { assert(!Running && "Thread not shut down explicitly\n"); }
diff --git a/offload/plugins-nextgen/common/src/RPC.cpp b/offload/plugins-nextgen/common/src/RPC.cpp
index 81ad9ca66808a..e6750a540b391 100644
--- a/offload/plugins-nextgen/common/src/RPC.cpp
+++ b/offload/plugins-nextgen/common/src/RPC.cpp
@@ -99,10 +99,15 @@ static rpc::Status runServer(plugin::GenericDeviceTy &Device, void *Buffer) {
 }
 
 void RPCServerTy::ServerThread::startThread() {
+  assert(!Running.load(std::memory_order_relaxed) &&
+         "Attempting to start thread that is already running");
+  Running.store(true, std::memory_order_release);
   Worker = std::thread([this]() { run(); });
 }
 
 void RPCServerTy::ServerThread::shutDown() {
+  assert(Running.load(std::memory_order_relaxed) &&
+         "Attempting to shut down a thread that is not running");
   {
     std::lock_guard<decltype(Mutex)> Lock(Mutex);
     Running.store(false, std::memory_order_release);
From 86705eb6242b5e2d6153708ddedffbfc95491756 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Mon, 27 Jan 2025 14:00:03 +0000
Subject: [PATCH 190/432] [X86] huge-stack-offset.ll - add gnux32 test coverage

This should match x86 for the basic implementation, but it's useful to
check it actually runs correctly.
--- llvm/test/CodeGen/X86/huge-stack-offset.ll | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/X86/huge-stack-offset.ll b/llvm/test/CodeGen/X86/huge-stack-offset.ll index 6629811a59b23..d6080cfd3f753 100644 --- a/llvm/test/CodeGen/X86/huge-stack-offset.ll +++ b/llvm/test/CodeGen/X86/huge-stack-offset.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-64 ; RUN: llc < %s -mtriple=i386-linux-unknown -verify-machineinstrs | FileCheck %s --check-prefix=CHECK-32 +; RUN: llc < %s -mtriple=x86_64-linux-gnux32 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-32 ; Test that a large stack offset uses a single add/sub instruction to ; adjust the stack pointer. @@ -11,7 +12,7 @@ define void @foo() nounwind { ; CHECK-64-NOT: subq $2147483647, %rsp ; CHECK-64: movabsq $50000000{{..}}, [[RAX:%r..]] ; CHECK-64-NEXT: addq [[RAX]], %rsp - +; ; CHECK-32-LABEL: foo: ; CHECK-32: ud2 ; CHECK-32-NOT: subl $2147483647, %esp @@ -27,7 +28,7 @@ define i32 @foo2() nounwind { ; CHECK-64-LABEL: foo2: ; CHECK-64: movl $10, %eax ; CHECK-64-NOT: movabsq ${{.*}}, %rax - +; ; CHECK-32-LABEL: foo2: ; CHECK-32: movl $10, %eax ; CHECK-32-NOT: movl ${{.*}}, %eax @@ -42,7 +43,7 @@ define i32 @foo3(i32 inreg %x) nounwind { ; CHECK-64-LABEL: foo3: ; CHECK-64: movabsq $50000000{{..}}, %rax ; CHECK-64-NEXT: subq %rax, %rsp - +; ; CHECK-32-LABEL: foo3: ; CHECK-32: ud2 ; CHECK-32-NOT: movl ${{.*}}, %eax From 3684ec425904424fc4dc80c8661f82bc676d7197 Mon Sep 17 00:00:00 2001 From: vdonaldson <37090318+vdonaldson@users.noreply.github.com> Date: Mon, 27 Jan 2025 09:18:47 -0500 Subject: [PATCH 191/432] [flang] IEEE underflow control for Arm (#124170) Update IEEE_SUPPORT_UNDERFLOW_CONTROL, IEEE_GET_UNDERFLOW_MODE, and IEEE_SET_UNDERFLOW_MODE code for Arm. --- flang/include/flang/Tools/TargetSetup.h | 31 ++++++++++----------- flang/runtime/exceptions.cpp | 36 ++++++++++++++++++++----- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/flang/include/flang/Tools/TargetSetup.h b/flang/include/flang/Tools/TargetSetup.h index d1b0da3a42c89..5d23df6823a94 100644 --- a/flang/include/flang/Tools/TargetSetup.h +++ b/flang/include/flang/Tools/TargetSetup.h @@ -24,34 +24,35 @@ namespace Fortran::tools { const std::string &compilerVersion, const std::string &compilerOptions) { const llvm::Triple &targetTriple{targetMachine.getTargetTriple()}; - // FIXME: Handle real(3) ? 
- if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) { - targetCharacteristics.DisableType( - Fortran::common::TypeCategory::Real, /*kind=*/10); - } + + targetCharacteristics.set_ieeeFeature(evaluate::IeeeFeature::Halting, true); + if (targetTriple.getArch() == llvm::Triple::ArchType::x86_64) { targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/3); targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/4); targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/8); } + if (targetTriple.isARM() || targetTriple.isAArch64()) { targetCharacteristics.set_haltingSupportIsUnknownAtCompileTime(); targetCharacteristics.set_ieeeFeature( evaluate::IeeeFeature::Halting, false); - } else { - targetCharacteristics.set_ieeeFeature(evaluate::IeeeFeature::Halting); + targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/3); + targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/4); + targetCharacteristics.set_hasSubnormalFlushingControl(/*kind=*/8); + } + + if (targetTriple.getArch() != llvm::Triple::ArchType::x86_64) { + targetCharacteristics.DisableType( + Fortran::common::TypeCategory::Real, /*kind=*/10); } - // Figure out if we can support F128: see - // flang/runtime/Float128Math/math-entries.h - // TODO: this should be taken from TargetInfo::getLongDoubleFormat to support - // cross-compilation + // Check for kind=16 support. See flang/runtime/Float128Math/math-entries.h. + // TODO: Take this from TargetInfo::getLongDoubleFormat for cross compilation. #ifdef FLANG_RUNTIME_F128_MATH_LIB - // we can use libquadmath wrappers - constexpr bool f128Support = true; + constexpr bool f128Support = true; // use libquadmath wrappers #elif HAS_LDBL128 - // we can use libm wrappers - constexpr bool f128Support = true; + constexpr bool f128Support = true; // use libm wrappers #else constexpr bool f128Support = false; #endif diff --git a/flang/runtime/exceptions.cpp b/flang/runtime/exceptions.cpp index f541b8e844ade..7fca0c431f8cd 100644 --- a/flang/runtime/exceptions.cpp +++ b/flang/runtime/exceptions.cpp @@ -11,7 +11,9 @@ #include "flang/Runtime/exceptions.h" #include "terminator.h" #include -#if __x86_64__ +#if __aarch64__ +#include +#elif __x86_64__ #include #endif @@ -90,20 +92,40 @@ bool RTNAME(SupportHalting)([[maybe_unused]] uint32_t except) { #endif } +// A hardware FZ (flush to zero) bit is the negation of the +// ieee_[get|set]_underflow_mode GRADUAL argument. +#if defined(_MM_FLUSH_ZERO_MASK) +// The MXCSR FZ bit affects computations of real kinds 3, 4, and 8. +#elif defined(_FPU_GETCW) +// The FPCR FZ bit affects computations of real kinds 3, 4, and 8. +// bit 24: FZ -- single, double precision flush to zero bit +// bit 19: FZ16 -- half precision flush to zero bit [not currently relevant] +#define _FPU_FPCR_FZ_MASK_ 0x01080000 +#endif + bool RTNAME(GetUnderflowMode)(void) { -#if _MM_FLUSH_ZERO_MASK - // The MXCSR Flush to Zero flag is the negation of the ieee_get_underflow_mode - // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. +#if defined(_MM_FLUSH_ZERO_MASK) return _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_OFF; +#elif defined(_FPU_GETCW) + uint32_t fpcr; + _FPU_GETCW(fpcr); + return (fpcr & _FPU_FPCR_FZ_MASK_) != _FPU_FPCR_FZ_MASK_; #else return false; #endif } void RTNAME(SetUnderflowMode)(bool flag) { -#if _MM_FLUSH_ZERO_MASK - // The MXCSR Flush to Zero flag is the negation of the ieee_set_underflow_mode - // GRADUAL argument. It affects real computations of kinds 3, 4, and 8. 
+#if defined(_MM_FLUSH_ZERO_MASK) _MM_SET_FLUSH_ZERO_MODE(flag ? _MM_FLUSH_ZERO_OFF : _MM_FLUSH_ZERO_ON); +#elif defined(_FPU_GETCW) + uint32_t fpcr; + _FPU_GETCW(fpcr); + if (flag) { + fpcr &= ~_FPU_FPCR_FZ_MASK_; + } else { + fpcr |= _FPU_FPCR_FZ_MASK_; + } + _FPU_SETCW(fpcr); #endif } From 3a4376b8f90686f754ee51b296a064ab03c12895 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Mon, 27 Jan 2025 14:21:14 +0000 Subject: [PATCH 192/432] LAA: handle 0 return from getPtrStride correctly (#124539) getPtrStride returns 0 when the PtrScev is loop-invariant, and this is not an erroneous value: it returns std::nullopt to communicate that it was not able to find a valid pointer stride. In analyzeLoop, we call getPtrStride with a value_or(0) conflating the zero return value with std::nullopt. Fix this, handling loop-invariant loads correctly. --- llvm/lib/Analysis/LoopAccessAnalysis.cpp | 4 +- .../LoopAccessAnalysis/pointer-phis.ll | 8 -- .../LoopDistribute/pointer-phi-in-loop.ll | 132 +++++++++--------- .../Inputs/loop-distribute.ll | 6 +- .../Inputs/loop-distribute.ll.expected | 46 +++--- 5 files changed, 101 insertions(+), 95 deletions(-) diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 11e0a221fc887..697b40403902c 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -1438,7 +1438,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); if (PSE.getSE()->isLoopInvariant(PtrScev, Lp)) - return {0}; + return 0; Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Unexpected non-ptr"); @@ -2593,7 +2593,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, LD->getType(), Ptr, TheLoop, SymbolicStrides).value_or(0)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll b/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll index a214451bfd3fd..48586ee9d9ed9 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/pointer-phis.ll @@ -501,14 +501,6 @@ define void @phi_load_store_memdep_check(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 -> ; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-EMPTY: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: %lv = load i16, ptr %A, align 1 -> -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 -; CHECK-EMPTY: -; CHECK-NEXT: Unknown: -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 -> -; CHECK-NEXT: %lv2 = load i16, ptr %A, align 1 -; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Check 0: ; CHECK-NEXT: Comparing group ([[GRP10:0x[0-9a-f]+]]): diff --git a/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll b/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll index 2ab9140baf866..b95551eb94f4c 100644 --- a/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll +++ b/llvm/test/Transforms/LoopDistribute/pointer-phi-in-loop.ll @@ -3,26 +3,73 @@ ; Testcases inspired by PR50296, PR50288. 
-define void @phi_load_store_distribute(i1 %c, ptr %A, ptr %B, ptr %C) { +define void @phi_load_store_distribute(i1 %cond, ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: @phi_load_store_distribute( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[A:%.*]], align 1 +; CHECK: for.body.lver.check: +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 2 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2 +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] +; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] +; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[ENTRY:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] +; CHECK: for.body.ph.lver.orig: +; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]] +; CHECK: for.body.lver.orig: +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[LV:%.*]] = load i16, ptr [[A]], align 1 ; CHECK-NEXT: store i16 [[LV]], ptr [[A]], align 1 -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] -; CHECK: if.then: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK: if.then.lver.orig: ; CHECK-NEXT: [[LV2:%.*]] = load i16, ptr [[A]], align 1 ; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end.lver.orig: +; CHECK-NEXT: [[C_SINK_LVER_ORIG:%.*]] = phi ptr [ [[B]], [[IF_THEN]] ], [ [[C]], [[FOR_BODY_LVER_ORIG]] ] +; CHECK-NEXT: [[LV3_LVER_ORIG:%.*]] = load i16, ptr [[C_SINK_LVER_ORIG]], align 2 +; CHECK-NEXT: [[ADD_LVER_ORIG:%.*]] = add i16 [[LV3_LVER_ORIG]], 10 +; CHECK-NEXT: store i16 [[ADD_LVER_ORIG]], ptr [[C_SINK_LVER_ORIG]], align 1 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 +; CHECK-NEXT: [[TOBOOL_NOT_LVER_ORIG:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY_LVER_ORIG]] +; CHECK: for.body.ph.ldist1: +; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]] +; CHECK: for.body.ldist1: +; CHECK-NEXT: [[IV_LDIST1:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LDIST1]] ], [ [[IV_NEXT_LDIST1:%.*]], [[IF_END_LDIST1:%.*]] ] +; CHECK-NEXT: [[LV_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: store i16 [[LV_LDIST1]], ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN_LDIST1:%.*]], label [[IF_END_LDIST1]] +; CHECK: if.then.ldist1: +; CHECK-NEXT: [[LV2_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: br label [[IF_END_LDIST1]] +; CHECK: if.end.ldist1: +; CHECK-NEXT: [[IV_NEXT_LDIST1]] = add nuw nsw i16 [[IV_LDIST1]], 1 +; CHECK-NEXT: [[TOBOOL_NOT_LDIST1:%.*]] = icmp eq i16 [[IV_NEXT_LDIST1]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT_LDIST1]], label [[FOR_BODY_PH:%.*]], label [[FOR_BODY_LDIST1]] +; CHECK: for.body.ph: +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: for.body: +; 
CHECK-NEXT: [[IV1:%.*]] = phi i16 [ 0, [[FOR_BODY_PH]] ], [ [[IV_NEXT1:%.*]], [[IF_END1:%.*]] ] +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN1:%.*]], label [[IF_END1]] +; CHECK: if.then: +; CHECK-NEXT: br label [[IF_END1]] ; CHECK: if.end: -; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B:%.*]], [[IF_THEN]] ], [ [[C:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B]], [[IF_THEN1]] ], [ [[C]], [[FOR_BODY1]] ] ; CHECK-NEXT: [[LV3:%.*]] = load i16, ptr [[C_SINK]], align 2 ; CHECK-NEXT: [[ADD:%.*]] = add i16 [[LV3]], 10 ; CHECK-NEXT: store i16 [[ADD]], ptr [[C_SINK]], align 1 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i16 [[IV1]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT1]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT_LOOPEXIT6:%.*]], label [[FOR_BODY1]] +; CHECK: for.end.loopexit.loopexit: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: for.end.loopexit.loopexit6: +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: ret void ; @@ -33,7 +80,7 @@ for.body: ; preds = %if.end, %entry %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] %lv = load i16, ptr %A, align 1 store i16 %lv, ptr %A, align 1 - br i1 %c, label %if.then, label %if.end + br i1 %cond, label %if.then, label %if.end if.then: ; preds = %for.body %lv2 = load i16, ptr %A, align 1 @@ -55,66 +102,21 @@ for.end.loopexit: ; preds = %if.end define void @phi_load_distribute(i1 %cond, ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: @phi_load_distribute( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY_LVER_CHECK:%.*]] -; CHECK: for.body.lver.check: -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 2 -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 2 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 2 -; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[C]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] -; CHECK-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] -; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] -; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] -; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PH_LVER_ORIG:%.*]], label [[FOR_BODY_PH_LDIST1:%.*]] -; CHECK: for.body.ph.lver.orig: ; CHECK-NEXT: br label [[FOR_BODY_LVER_ORIG:%.*]] -; CHECK: for.body.lver.orig: -; CHECK-NEXT: [[IV_LVER_ORIG:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LVER_ORIG]] ], [ [[IV_NEXT_LVER_ORIG:%.*]], [[IF_END_LVER_ORIG:%.*]] ] -; CHECK-NEXT: [[LV_LVER_ORIG:%.*]] = load i16, ptr [[A]], align 1 +; CHECK: for.body: +; CHECK-NEXT: [[IV_LVER_ORIG:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LVER_ORIG:%.*]] ], [ [[IV_NEXT_LVER_ORIG:%.*]], [[IF_END_LVER_ORIG:%.*]] ] +; CHECK-NEXT: [[LV_LVER_ORIG:%.*]] = load i16, ptr [[A:%.*]], align 1 ; CHECK-NEXT: store i16 [[LV_LVER_ORIG]], ptr [[A]], align 1 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN_LVER_ORIG:%.*]], label [[IF_END_LVER_ORIG]] -; CHECK: if.then.lver.orig: -; CHECK-NEXT: [[LV2_LVER_ORIG:%.*]] = load i16, ptr [[A]], align 1 -; CHECK-NEXT: br label [[IF_END_LVER_ORIG]] -; CHECK: if.end.lver.orig: -; CHECK-NEXT: 
[[C_SINK_LVER_ORIG:%.*]] = phi ptr [ [[B]], [[IF_THEN_LVER_ORIG]] ], [ [[C]], [[FOR_BODY_LVER_ORIG]] ] -; CHECK-NEXT: [[LV3_LVER_ORIG:%.*]] = load i16, ptr [[C_SINK_LVER_ORIG]], align 2 -; CHECK-NEXT: [[IV_NEXT_LVER_ORIG]] = add nuw nsw i16 [[IV_LVER_ORIG]], 1 -; CHECK-NEXT: [[TOBOOL_NOT_LVER_ORIG:%.*]] = icmp eq i16 [[IV_NEXT_LVER_ORIG]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT_LVER_ORIG]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[FOR_BODY_LVER_ORIG]] -; CHECK: for.body.ph.ldist1: -; CHECK-NEXT: br label [[FOR_BODY_LDIST1:%.*]] -; CHECK: for.body.ldist1: -; CHECK-NEXT: [[IV_LDIST1:%.*]] = phi i16 [ 0, [[FOR_BODY_PH_LDIST1]] ], [ [[IV_NEXT_LDIST1:%.*]], [[IF_END_LDIST1:%.*]] ] -; CHECK-NEXT: [[LV_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] -; CHECK-NEXT: store i16 [[LV_LDIST1]], ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN_LDIST1:%.*]], label [[IF_END_LDIST1]] -; CHECK: if.then.ldist1: -; CHECK-NEXT: [[LV2_LDIST1:%.*]] = load i16, ptr [[A]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: br label [[IF_END_LDIST1]] -; CHECK: if.end.ldist1: -; CHECK-NEXT: [[IV_NEXT_LDIST1]] = add nuw nsw i16 [[IV_LDIST1]], 1 -; CHECK-NEXT: [[TOBOOL_NOT_LDIST1:%.*]] = icmp eq i16 [[IV_NEXT_LDIST1]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT_LDIST1]], label [[FOR_BODY_PH:%.*]], label [[FOR_BODY_LDIST1]] -; CHECK: for.body.ph: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[FOR_BODY_PH]] ], [ [[IV_NEXT:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: br label [[IF_END]] +; CHECK-NEXT: [[LV2:%.*]] = load i16, ptr [[A]], align 1 +; CHECK-NEXT: br label [[IF_END_LVER_ORIG]] ; CHECK: if.end: -; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B]], [[IF_THEN]] ], [ [[C]], [[FOR_BODY]] ] +; CHECK-NEXT: [[C_SINK:%.*]] = phi ptr [ [[B:%.*]], [[IF_THEN_LVER_ORIG]] ], [ [[C:%.*]], [[FOR_BODY_LVER_ORIG]] ] ; CHECK-NEXT: [[LV3:%.*]] = load i16, ptr [[C_SINK]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT_LOOPEXIT6:%.*]], label [[FOR_BODY]] -; CHECK: for.end.loopexit.loopexit: -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: for.end.loopexit.loopexit6: -; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK-NEXT: [[IV_NEXT_LVER_ORIG]] = add nuw nsw i16 [[IV_LVER_ORIG]], 1 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i16 [[IV_NEXT_LVER_ORIG]], 1000 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY_LVER_ORIG]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll index 48f80533c6379..548aa0ab2673b 100644 --- a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll +++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll @@ -1,7 +1,7 @@ ; RUN: opt -passes=loop-distribute -enable-loop-distribute \ ; RUN: -debug-only=loop-distribute -disable-output 2>&1 %s | FileCheck %s -define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { +define void @ldist(i1 %cond, ptr %A, ptr %B, ptr %C) { entry: br label %for.body @@ -9,7 +9,7 @@ 
for.body: ; preds = %if.end, %entry %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] %lv = load i16, ptr %A, align 1 store i16 %lv, ptr %A, align 1 - br i1 %c, label %if.then, label %if.end + br i1 %cond, label %if.then, label %if.end if.then: ; preds = %for.body %lv2 = load i16, ptr %A, align 1 @@ -18,6 +18,8 @@ if.then: ; preds = %for.body if.end: ; preds = %if.then, %for.body %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] %lv3 = load i16, ptr %c.sink + %add = add i16 %lv3, 10 + store i16 %add, ptr %c.sink, align 1 %iv.next = add nuw nsw i16 %iv, 1 %tobool.not = icmp eq i16 %iv.next, 1000 br i1 %tobool.not, label %for.end.loopexit, label %for.body diff --git a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected index baef851b84ee5..eba378c175091 100644 --- a/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_analyze_test_checks/Inputs/loop-distribute.ll.expected @@ -2,51 +2,55 @@ ; RUN: opt -passes=loop-distribute -enable-loop-distribute \ ; RUN: -debug-only=loop-distribute -disable-output 2>&1 %s | FileCheck %s -define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { +define void @ldist(i1 %cond, ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: 'ldist' ; CHECK-NEXT: LDist: Found a candidate loop: for.body ; CHECK-NEXT: LDist: Backward dependences: ; CHECK-NEXT: Unknown: -; CHECK-NEXT: %lv = load i16, ptr %A, align 1 -> -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 +; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: Unknown: -; CHECK-NEXT: store i16 %lv, ptr %A, align 1 -> -; CHECK-NEXT: %lv2 = load i16, ptr %A, align 1 +; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 -> +; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: LDist: Seeded partitions: -; CHECK-NEXT: LDist: Partition 0: (cycle) +; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body: %lv = load i16, ptr %A, align 1 -; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 -; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 ; CHECK-NEXT: LDist: Partition 1: -; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 ; CHECK-NEXT: LDist: Partition 2: +; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 +; CHECK-NEXT: LDist: Partition 3: (cycle) ; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: if.end: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: LDist: Merged partitions: -; CHECK-NEXT: LDist: Partition 0: (cycle) +; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body: %lv = load i16, ptr %A, align 1 ; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 ; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 -; CHECK-NEXT: LDist: Partition 1: +; CHECK-NEXT: LDist: Partition 1: (cycle) ; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: if.end: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: LDist: Populated partitions: -; CHECK-NEXT: LDist: Partition 0: (cycle) +; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body: %lv = load i16, ptr %A, align 1 ; CHECK-NEXT: for.body: store i16 %lv, ptr %A, align 1 ; CHECK-NEXT: if.then: %lv2 = load i16, ptr %A, align 1 -; CHECK-NEXT: for.body: br i1 %c, label %if.then, label %if.end +; CHECK-NEXT: for.body: br i1 %cond, label %if.then, label %if.end ; 
CHECK-NEXT: if.then: br label %if.end ; CHECK-NEXT: if.end: br i1 %tobool.not, label %for.end.loopexit, label %for.body ; CHECK-NEXT: if.end: %tobool.not = icmp eq i16 %iv.next, 1000 ; CHECK-NEXT: if.end: %iv.next = add nuw nsw i16 %iv, 1 ; CHECK-NEXT: for.body: %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] -; CHECK-NEXT: LDist: Partition 1: +; CHECK-NEXT: LDist: Partition 1: (cycle) ; CHECK-NEXT: if.end: %lv3 = load i16, ptr %c.sink, align 2 -; CHECK-NEXT: for.body: br i1 %c, label %if.then, label %if.end +; CHECK-NEXT: if.end: store i16 %add, ptr %c.sink, align 1 +; CHECK-NEXT: for.body: br i1 %cond, label %if.then, label %if.end ; CHECK-NEXT: if.then: br label %if.end ; CHECK-NEXT: if.end: br i1 %tobool.not, label %for.end.loopexit, label %for.body ; CHECK-NEXT: if.end: %tobool.not = icmp eq i16 %iv.next, 1000 ; CHECK-NEXT: if.end: %iv.next = add nuw nsw i16 %iv, 1 ; CHECK-NEXT: for.body: %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] +; CHECK-NEXT: if.end: %add = add i16 %lv3, 10 ; CHECK-NEXT: if.end: %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] ; CHECK-NEXT: LDist: Distributing loop: for.body ; CHECK-NEXT: LDist: Pointers: @@ -56,19 +60,21 @@ define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: ptr %A ; CHECK-NEXT: Against group ([[GRP2:0x[0-9a-f]+]]): ; CHECK-NEXT: ptr %C +; CHECK-NEXT: ptr %C ; CHECK-NEXT: Check 1: ; CHECK-NEXT: Comparing group ([[GRP1]]): ; CHECK-NEXT: ptr %A ; CHECK-NEXT: ptr %A ; CHECK-NEXT: Against group ([[GRP3:0x[0-9a-f]+]]): ; CHECK-NEXT: ptr %B +; CHECK-NEXT: ptr %B ; CHECK-NEXT: LDist: After removing unused Instrs: ; CHECK-NEXT: LDist: Partition 0: ; CHECK-NEXT: for.body.ldist1: ; preds = %if.end.ldist1, %for.body.ph.ldist1 ; CHECK-NEXT: %iv.ldist1 = phi i16 [ 0, %for.body.ph.ldist1 ], [ %iv.next.ldist1, %if.end.ldist1 ] ; CHECK-NEXT: %lv.ldist1 = load i16, ptr %A, align 1, !alias.scope !0, !noalias !3 ; CHECK-NEXT: store i16 %lv.ldist1, ptr %A, align 1, !alias.scope !0, !noalias !3 -; CHECK-NEXT: br i1 %c, label %if.then.ldist1, label %if.end.ldist1 +; CHECK-NEXT: br i1 %cond, label %if.then.ldist1, label %if.end.ldist1 ; CHECK-EMPTY: ; CHECK-NEXT: if.then.ldist1: ; preds = %for.body.ldist1 ; CHECK-NEXT: %lv2.ldist1 = load i16, ptr %A, align 1, !alias.scope !0, !noalias !3 @@ -81,7 +87,7 @@ define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: LDist: Partition 1: ; CHECK-NEXT: for.body: ; preds = %if.end, %for.body.ph ; CHECK-NEXT: %iv = phi i16 [ 0, %for.body.ph ], [ %iv.next, %if.end ] -; CHECK-NEXT: br i1 %c, label %if.then, label %if.end +; CHECK-NEXT: br i1 %cond, label %if.then, label %if.end ; CHECK-EMPTY: ; CHECK-NEXT: if.then: ; preds = %for.body ; CHECK-NEXT: br label %if.end @@ -89,6 +95,8 @@ define void @ldist(i1 %c, ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: if.end: ; preds = %if.then, %for.body ; CHECK-NEXT: %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] ; CHECK-NEXT: %lv3 = load i16, ptr %c.sink, align 2 +; CHECK-NEXT: %add = add i16 %lv3, 10 +; CHECK-NEXT: store i16 %add, ptr %c.sink, align 1 ; CHECK-NEXT: %iv.next = add nuw nsw i16 %iv, 1 ; CHECK-NEXT: %tobool.not = icmp eq i16 %iv.next, 1000 ; CHECK-NEXT: br i1 %tobool.not, label %for.end.loopexit.loopexit6, label %for.body @@ -100,7 +108,7 @@ for.body: ; preds = %if.end, %entry %iv = phi i16 [ 0, %entry ], [ %iv.next, %if.end ] %lv = load i16, ptr %A, align 1 store i16 %lv, ptr %A, align 1 - br i1 %c, label %if.then, label %if.end + br i1 %cond, label %if.then, label %if.end if.then: ; preds = %for.body %lv2 = load i16, ptr %A, align 1 @@ 
-109,6 +117,8 @@ if.then: ; preds = %for.body if.end: ; preds = %if.then, %for.body %c.sink = phi ptr [ %B, %if.then ], [ %C, %for.body ] %lv3 = load i16, ptr %c.sink + %add = add i16 %lv3, 10 + store i16 %add, ptr %c.sink, align 1 %iv.next = add nuw nsw i16 %iv, 1 %tobool.not = icmp eq i16 %iv.next, 1000 br i1 %tobool.not, label %for.end.loopexit, label %for.body From f07505849c8e683bf8f444e205d3dd3284759b7d Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 27 Jan 2025 08:29:20 -0600 Subject: [PATCH 193/432] [Offload] Fix server thread from being shut down if unused --- offload/plugins-nextgen/common/src/PluginInterface.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index c9acabea6977d..0d5169191c5af 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -1633,11 +1633,12 @@ Error GenericPluginTy::deinit() { if (GlobalHandler) delete GlobalHandler; - if (RPCServer) { + if (RPCServer->Thread->Running.load(std::memory_order_relaxed)) if (Error Err = RPCServer->shutDown()) return Err; + + if (RPCServer) delete RPCServer; - } if (RecordReplay) delete RecordReplay; From 54928a10c8dba7c07c6224c1ead5c02a335890e6 Mon Sep 17 00:00:00 2001 From: Dipesh Sharma <76941383+dipeshs809@users.noreply.github.com> Date: Mon, 27 Jan 2025 20:00:53 +0530 Subject: [PATCH 194/432] [clang] __STDC_NO_THREADS__ is no longer necessary for VS 2022 1939 and above (#117149) Since `__STDC_NO_THREADS__` is a reserved identifier, it is now defined only when: - `MSVC version < 17.9` (1939) - C version < C11 (201112L) - `<threads.h>` is unavailable, i.e. `!__has_include(<threads.h>)` if `__has_include` is defined. Closes #115529 --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Basic/LangOptions.h | 1 + clang/lib/Basic/Targets/OSTargets.cpp | 6 ++++-- .../deprecate-threads-macro-definition-msvc1939.c | 15 +++++++++++++++ clang/test/Preprocessor/init-aarch64.c | 2 +- 5 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c60565a568234..55a4a2e32383a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -387,6 +387,7 @@ C Language Changes ------------------ - Extend clang's ``<limits.h>`` to define ``LONG_LONG_*`` macros for Android's bionic. +- Macro ``__STDC_NO_THREADS__`` is no longer necessary for MSVC 2022 1939 and later.
C2y Feature Support ^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 949c8f5d448bc..114a5d34a008b 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -144,6 +144,7 @@ class LangOptionsBase { MSVC2019_5 = 1925, MSVC2019_8 = 1928, MSVC2022_3 = 1933, + MSVC2022_9 = 1939, }; enum SYCLMajorVersion { diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp index bf10f9a725567..8af6623e5cb15 100644 --- a/clang/lib/Basic/Targets/OSTargets.cpp +++ b/clang/lib/Basic/Targets/OSTargets.cpp @@ -259,8 +259,10 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) { Builder.defineMacro("_KERNEL_MODE"); Builder.defineMacro("_INTEGRAL_MAX_BITS", "64"); - Builder.defineMacro("__STDC_NO_THREADS__"); - + // Define __STDC_NO_THREADS__ based on MSVC version, threads.h availability, + // and language standard. + if (!(Opts.isCompatibleWithMSVC(LangOptions::MSVC2022_9) && Opts.C11)) + Builder.defineMacro("__STDC_NO_THREADS__"); // Starting with VS 2022 17.1, MSVC predefines the below macro to inform // users of the execution character set defined at compile time. // The value given is the Windows Code Page Identifier: diff --git a/clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c b/clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c new file mode 100644 index 0000000000000..e197d8d403a3f --- /dev/null +++ b/clang/test/Preprocessor/deprecate-threads-macro-definition-msvc1939.c @@ -0,0 +1,15 @@ +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c89 -fms-compatibility-version=19.33 -ffreestanding < /dev/null | FileCheck -check-prefix=C89_MSVC33 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c99 -fms-compatibility-version=19.33 -ffreestanding < /dev/null | FileCheck -check-prefix=C99_MSVC33 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c11 -fms-compatibility-version=19.33 -ffreestanding < /dev/null | FileCheck -check-prefix=C11_MSVC33 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c89 -fms-compatibility-version=19.39 -ffreestanding < /dev/null | FileCheck -check-prefix=C89_MSVC39 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c99 -fms-compatibility-version=19.39 -ffreestanding < /dev/null | FileCheck -check-prefix=C99_MSVC39 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c11 -fms-compatibility-version=19.39 -ffreestanding < /dev/null | FileCheck -check-prefix=C11_MSVC39 %s +// RUN: %clang_cc1 -E -dM -triple=arm64ec-windows-msvc -std=c11 -fms-compatibility-version=19.40 -ffreestanding < /dev/null | FileCheck -check-prefix=C11_MSVC40 %s + +// C89_MSVC33: #define __STDC_NO_THREADS__ 1 +// C99_MSVC33: #define __STDC_NO_THREADS__ 1 +// C11_MSVC33: #define __STDC_NO_THREADS__ 1 +// C89_MSVC39: #define __STDC_NO_THREADS__ 1 +// C99_MSVC39: #define __STDC_NO_THREADS__ 1 +// C11_MSVC39-NOT: #define __STDC_NO_THREADS__ +// C11_MSVC40-NOT: #define __STDC_NO_THREADS__ diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index b5e77ba10c347..5f47de4b49b69 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -772,7 +772,7 @@ // AARCH64-MSVC: #define __WINT_WIDTH__ 16 // AARCH64-MSVC: #define __aarch64__ 1 -// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64ec-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix 
ARM64EC-MSVC %s +// RUN: %clang_cc1 -E -dM -fms-compatibility-version=19.33 -ffreestanding -triple=arm64ec-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix ARM64EC-MSVC %s // ARM64EC-MSVC: #define _INTEGRAL_MAX_BITS 64 // ARM64EC-MSVC: #define _M_AMD64 100 From f95a8bde3425ada0ef004186eb8ccda6e723241c Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 27 Jan 2025 14:31:41 +0000 Subject: [PATCH 195/432] [AArch64] Refactor implementation of FP8 types (NFC) (#123604) - The FP8 scalar type (`__mfp8`) was described as a vector type - The FP8 vector types were described/assumed to have integer element type (the element type ought to be `__mfp8`) - Add support for `m` type specifier (denoting `__mfp8`) in `DecodeTypeFromStr` and create builtin function prototypes using that specifier, instead of `int8_t` --- .../clang/Basic/AArch64SVEACLETypes.def | 35 +++++++++---------- clang/lib/AST/ASTContext.cpp | 30 +++++++++------- clang/lib/AST/ItaniumMangle.cpp | 2 +- clang/lib/AST/Type.cpp | 4 +-- clang/lib/CodeGen/CodeGenTypes.cpp | 22 +++++++----- clang/lib/CodeGen/Targets/AArch64.cpp | 7 ++-- clang/utils/TableGen/SveEmitter.cpp | 4 +-- 7 files changed, 58 insertions(+), 46 deletions(-) diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def index 2dd2754e778d6..a408bb0c54057 100644 --- a/clang/include/clang/Basic/AArch64SVEACLETypes.def +++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def @@ -57,6 +57,11 @@ // - IsBF true for vector of brain float elements. //===----------------------------------------------------------------------===// +#ifndef SVE_SCALAR_TYPE +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ + SVE_TYPE(Name, Id, SingletonId) +#endif + #ifndef SVE_VECTOR_TYPE #define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) @@ -72,6 +77,11 @@ SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true) #endif +#ifndef SVE_VECTOR_TYPE_MFLOAT +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, false) +#endif + #ifndef SVE_VECTOR_TYPE_FLOAT #define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false) @@ -97,16 +107,6 @@ SVE_TYPE(Name, Id, SingletonId) #endif -#ifndef AARCH64_VECTOR_TYPE -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ - SVE_TYPE(Name, Id, SingletonId) -#endif - -#ifndef AARCH64_VECTOR_TYPE_MFLOAT -#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ - AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) -#endif - //===- Vector point types -----------------------------------------------===// SVE_VECTOR_TYPE_INT("__SVInt8_t", "__SVInt8_t", SveInt8, SveInt8Ty, 16, 8, 1, true) @@ -125,8 +125,7 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) -// This is a 8 bits opaque type. 
-SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1, false) +SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1) // // x2 @@ -148,7 +147,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2) // // x3 @@ -170,7 +169,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3) // // x4 @@ -192,7 +191,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4) SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) @@ -200,15 +199,15 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy) -AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1) +SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8) #undef SVE_VECTOR_TYPE +#undef SVE_VECTOR_TYPE_MFLOAT #undef SVE_VECTOR_TYPE_BFLOAT #undef SVE_VECTOR_TYPE_FLOAT #undef SVE_VECTOR_TYPE_INT #undef SVE_PREDICATE_TYPE #undef SVE_PREDICATE_TYPE_ALL #undef SVE_OPAQUE_TYPE -#undef AARCH64_VECTOR_TYPE_MFLOAT -#undef AARCH64_VECTOR_TYPE +#undef SVE_SCALAR_TYPE #undef SVE_TYPE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index a4ba9fd055346..cd1bcb3b9a063 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2269,11 +2269,10 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { Width = 0; \ Align = 16; \ break; -#define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ - ElBits, NF) \ +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ case BuiltinType::Id: \ - Width = NumEls * ElBits * NF; \ - Align = NumEls * ElBits; \ + Width = Bits; \ + Align = Bits; \ break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ @@ -4423,15 +4422,14 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const { ElBits, NF) \ case BuiltinType::Id: \ return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {MFloat8Ty, llvm::ElementCount::getScalable(NumEls), NF}; #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ case BuiltinType::Id: \ return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF}; -#define 
AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ - ElBits, NF) \ - case BuiltinType::Id: \ - return {getIntTypeForBitwidth(ElBits, false), \ - llvm::ElementCount::getFixed(NumEls), NF}; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF, \ @@ -4493,11 +4491,16 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ return SingletonId; \ } +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->isMFloat8Type() && EltTySize == ElBits && \ + NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \ return SingletonId; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" } else if (Target->hasRISCVVTypes()) { uint64_t EltTySize = getTypeSize(EltTy); @@ -12382,6 +12385,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, case 'p': Type = Context.getProcessIDType(); break; + case 'm': + Type = Context.MFloat8Ty; + break; } // If there are modifiers and if we're allowed to parse them, go for it. diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 9948963d7f44b..49089c0ea3c8a 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3433,7 +3433,7 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { type_name = MangledName; \ Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ break; -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ case BuiltinType::Id: \ type_name = MangledName; \ Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index caa0ac858a1be..fde0746a17570 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2527,9 +2527,7 @@ bool Type::isSVESizelessBuiltinType() const { #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ return true; -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ - case BuiltinType::Id: \ - return false; +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" default: return false; diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 950b23f4e13b9..405242e97e75c 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -505,15 +505,18 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::Id: #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ - case BuiltinType::Id: -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(cast(Ty)); - auto VTy = - llvm::VectorType::get(ConvertType(Info.ElementType), Info.EC); + // The `__mfp8` type maps to `<1 x i8>` which can't be used to build + // a vector type, hence bypass the call to `ConvertType` for + // the element type and create the vector type directly. + auto *EltTy = Info.ElementType->isMFloat8Type() + ? llvm::Type::getInt8Ty(getLLVMContext()) + : ConvertType(Info.ElementType); + auto *VTy = llvm::VectorType::get(EltTy, Info.EC); switch (Info.NumVectors) { default: llvm_unreachable("Expected 1, 2, 3 or 4 vectors!"); @@ -529,6 +532,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { } case BuiltinType::SveCount: return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + case BuiltinType::MFloat8: + return llvm::VectorType::get(llvm::Type::getInt8Ty(getLLVMContext()), 1, + false); #define PPC_VECTOR_TYPE(Name, Id, Size) \ case BuiltinType::Id: \ ResultType = \ @@ -650,9 +656,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { // An ext_vector_type of Bool is really a vector of bits. llvm::Type *IRElemTy = VT->isExtVectorBoolType() ? llvm::Type::getInt1Ty(getLLVMContext()) - : (VT->getElementType()->isMFloat8Type() - ? llvm::Type::getInt8Ty(getLLVMContext()) - : ConvertType(VT->getElementType())); + : VT->getElementType()->isMFloat8Type() + ? llvm::Type::getInt8Ty(getLLVMContext()) + : ConvertType(VT->getElementType()); ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements()); break; } diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index c702e79ff8eb9..057199c66f5a1 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -244,6 +244,7 @@ AArch64ABIInfo::convertFixedToScalableVectorType(const VectorType *VT) const { case BuiltinType::SChar: case BuiltinType::UChar: + case BuiltinType::MFloat8: return llvm::ScalableVectorType::get( llvm::Type::getInt8Ty(getVMContext()), 16); @@ -776,8 +777,10 @@ bool AArch64ABIInfo::passAsPureScalableType( NPred += Info.NumVectors; else NVec += Info.NumVectors; - auto VTy = llvm::ScalableVectorType::get(CGT.ConvertType(Info.ElementType), - Info.EC.getKnownMinValue()); + llvm::Type *EltTy = Info.ElementType->isMFloat8Type() + ? 
llvm::Type::getInt8Ty(getVMContext()) + : CGT.ConvertType(Info.ElementType); + auto *VTy = llvm::ScalableVectorType::get(EltTy, Info.EC.getKnownMinValue()); if (CoerceToSeq.size() + Info.NumVectors > 12) return false; diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 0ecbf7cede1da..687d344163e20 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -449,7 +449,7 @@ std::string SVEType::builtinBaseType() const { case TypeKind::PredicatePattern: return "i"; case TypeKind::Fpm: - return "Wi"; + return "UWi"; case TypeKind::Predicate: return "b"; case TypeKind::BFloat16: @@ -457,7 +457,7 @@ return "y"; case TypeKind::MFloat8: assert(ElementBitwidth == 8 && "Invalid MFloat8!"); - return "c"; + return "m"; case TypeKind::Float: switch (ElementBitwidth) { case 16: From f1d5e70a00fbc80f42977800e9299353b06d48cb Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Mon, 27 Jan 2025 06:28:45 -0800 Subject: [PATCH 196/432] [SLP][NFC] Do not check poison values for corresponding vectorized entries No need to check whether poison values have been vectorized, or to mark them as vectorized; this should apply only to instructions. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index eea6b32460d70..640fcb56aab19 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -3643,6 +3643,8 @@ class BoUpSLP { } if (!Last->isGather()) { for (Value *V : VL) { + if (isa<PoisonValue>(V)) + continue; const TreeEntry *TE = getTreeEntry(V); assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) && "Scalar already in tree!"); From eaa5897534cbd263d0cdbf780f72133c2fe8d8d4 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 27 Jan 2025 14:41:40 +0000 Subject: [PATCH 197/432] [libclc] Optimize CLC vector is(un)ordered builtins (#124546) These are similar to 347fb208, but these builtins are expressed in terms of other builtins. The LLVM IR generated features the same fcmp ord/uno comparisons as before, but consistently in vector form.
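The rewrite leans on the IEEE-754 identity that NaN is the only value which compares unequal to itself, so the ordered/unordered checks reduce to self-equality tests that lower to the same fcmp ord/uno per element. A minimal scalar sketch of that identity (illustrative only; the `my_*` names are hypothetical and this is not the actual libclc macro expansion):

/* Only NaN makes x == x false, so these match fcmp ord / fcmp uno. */
int my_isordered(float x, float y)   { return (x == x) && (y == y); }
int my_isunordered(float x, float y) { return !(x == x) || !(y == y); }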
--- .../clc/include/clc/relational/relational.h | 79 ------------------- .../lib/generic/relational/clc_isordered.cl | 22 +++--- .../lib/generic/relational/clc_isunordered.cl | 26 +++--- 3 files changed, 18 insertions(+), 109 deletions(-) diff --git a/libclc/clc/include/clc/relational/relational.h b/libclc/clc/include/clc/relational/relational.h index f32e7630203e4..f269715cfc83c 100644 --- a/libclc/clc/include/clc/relational/relational.h +++ b/libclc/clc/include/clc/relational/relational.h @@ -63,85 +63,6 @@ ARG_TYPE) \ _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) -#define _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_NAME, \ - ARG0_TYPE, ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return BUILTIN_NAME(x, y); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.lo, y.lo), \ - FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.lo, y.lo), \ - FUNCTION(x.hi, y.hi)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2), \ - FUNCTION(x.s3, y.s3)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ - FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ - FUNCTION(x.s6, y.s6), \ - FUNCTION(x.s7, y.s7)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG0_TYPE x, ARG1_TYPE y) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \ - FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \ - FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \ - FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), \ - FUNCTION(x.s8, y.s8), FUNCTION(x.s9, y.s9), \ - FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \ - FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), \ - FUNCTION(x.se, y.se), \ - FUNCTION(x.sf, y.sf)} != (RET_TYPE)0); \ - } - -#define _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC2(RET_TYPE##2, FUNCTION, ARG0_TYPE##2, \ - ARG1_TYPE##2) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC3(RET_TYPE##3, FUNCTION, ARG0_TYPE##3, \ - ARG1_TYPE##3) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC4(RET_TYPE##4, FUNCTION, ARG0_TYPE##4, \ - ARG1_TYPE##4) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC8(RET_TYPE##8, FUNCTION, ARG0_TYPE##8, \ - ARG1_TYPE##8) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC16(RET_TYPE##16, FUNCTION, ARG0_TYPE##16, \ - ARG1_TYPE##16) - -#define 
_CLC_DEFINE_RELATIONAL_BINARY(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, \ - ARG0_TYPE, ARG1_TYPE) \ - _CLC_DEFINE_RELATIONAL_BINARY_SCALAR(RET_TYPE, FUNCTION, BUILTIN_FUNCTION, \ - ARG0_TYPE, ARG1_TYPE) \ - _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE, \ - ARG1_TYPE) - #define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(RET_TYPE, RET_TYPE_VEC, FUNCTION, \ ARG1_TYPE, ARG2_TYPE) \ _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ diff --git a/libclc/clc/lib/generic/relational/clc_isordered.cl b/libclc/clc/lib/generic/relational/clc_isordered.cl index 6183d1ddf918f..73cd96a0a56ed 100644 --- a/libclc/clc/lib/generic/relational/clc_isordered.cl +++ b/libclc/clc/lib/generic/relational/clc_isordered.cl @@ -2,33 +2,29 @@ #include #include -#define _CLC_DEFINE_ISORDERED(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ - return __clc_isequal(x, x) && __clc_isequal(y, y); \ - } +#define _CLC_RELATIONAL_OP(X, Y) \ + __clc_isequal((X), (X)) && __clc_isequal((Y), (Y)) -_CLC_DEFINE_ISORDERED(int, __clc_isordered, float, float) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isordered, float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isordered, float, float) #ifdef cl_khr_fp64 + #pragma OPENCL EXTENSION cl_khr_fp64 : enable // The scalar version of __clc_isordered(double, double) returns an int, but the // vector versions return long. - -_CLC_DEFINE_ISORDERED(int, __clc_isordered, double, double) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isordered, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isordered, double, double) #endif + #ifdef cl_khr_fp16 + #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_isordered(half, half) returns an int, but the // vector versions return short. - -_CLC_DEFINE_ISORDERED(int, __clc_isordered, half, half) -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isordered, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isordered, half, half) #endif -#undef _CLC_DEFINE_ISORDERED +#undef _CLC_RELATIONAL_OP diff --git a/libclc/clc/lib/generic/relational/clc_isunordered.cl b/libclc/clc/lib/generic/relational/clc_isunordered.cl index dbbec031a65e5..fefda8e567517 100644 --- a/libclc/clc/lib/generic/relational/clc_isunordered.cl +++ b/libclc/clc/lib/generic/relational/clc_isunordered.cl @@ -1,12 +1,11 @@ #include +#include #include -// Note: It would be nice to use __builtin_isunordered with vector inputs, but -// it seems to only take scalar values as input, which will produce incorrect -// output for vector input types. +#define _CLC_RELATIONAL_OP(X, Y) \ + !__clc_isequal((X), (X)) || !__clc_isequal((Y), (Y)) -_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isunordered, __builtin_isunordered, - float, float) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isunordered, float, float) #ifdef cl_khr_fp64 @@ -14,25 +13,18 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isunordered, __builtin_isunordered, // The scalar version of __clc_isunordered(double, double) returns an int, but // the vector versions return long. 
- -_CLC_DEF _CLC_OVERLOAD int __clc_isunordered(double x, double y) { - return __builtin_isunordered(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isunordered, double, double) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isunordered, double, double) #endif + #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable // The scalar version of __clc_isunordered(half, half) returns an int, but the // vector versions return short. - -_CLC_DEF _CLC_OVERLOAD int __clc_isunordered(half x, half y) { - return __builtin_isunordered(x, y); -} - -_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isunordered, half, half) +_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isunordered, half, half) #endif + +#undef _CLC_RELATIONAL_OP From 561132e71b29d9b747dfda1509f715847852f77b Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Mon, 27 Jan 2025 15:50:51 +0100 Subject: [PATCH 198/432] [Clang] Fix immediate escalation of template function specializations. (#124404) We record whether an expression is immediate escalating in the FunctionScope. However, that only happens when parsing or transforming an expression. This might not happen when transforming a non-dependent expression. This patch fixes that by considering a function immediate when instantiated from an immediate function. Fixes #123405 --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/Decl.cpp | 4 ++++ .../test/SemaCXX/cxx2b-consteval-propagate.cpp | 18 ++++++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 55a4a2e32383a..031c5d84e49f9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1003,6 +1003,7 @@ Bug Fixes to C++ Support - Fixed assertions or false compiler diagnostics in the case of C++ modules for lambda functions or inline friend functions defined inside templates (#GH122493). - Clang now rejects declaring an alias template with the same name as its template parameter. (#GH123423) +- Fixed immediate escalation of non-dependent expressions. (#GH123405) - Fix type of expression when calling a template which returns an ``__array_rank`` querying a type depending on a template parameter. Now, such an expression can be used with ``static_assert`` and ``constexpr``. (#GH123498) - Correctly determine the implicit constexprness of lambdas in dependent contexts.
(#GH97958) (#GH114234) diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 74bcb618f2950..728556614e632 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3314,6 +3314,10 @@ bool FunctionDecl::isImmediateFunction() const { .getConstructor() ->isImmediateFunction(); + if (FunctionDecl *P = getTemplateInstantiationPattern(); + P && P->isImmediateFunction()) + return true; + if (const auto *MD = dyn_cast<CXXMethodDecl>(this); MD && MD->isLambdaStaticInvoker()) return MD->getParent()->getLambdaCallOperator()->isImmediateFunction(); diff --git a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp index 3f3123eaee76b..222d482f40aa5 100644 --- a/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp +++ b/clang/test/SemaCXX/cxx2b-consteval-propagate.cpp @@ -528,3 +528,21 @@ D d(0); // expected-note {{in implicit initialization for inherited constructor // expected-error@-1 {{call to immediate function 'GH112677::D::SimpleCtor' is not a constant expression}} } + +namespace GH123405 { + +consteval void fn() {} + +template <typename T> +constexpr int tfn(int) { + auto p = &fn; // expected-note {{'tfn<int>' is an immediate function because its body evaluates the address of a consteval function 'fn'}} + return int(p); // expected-error {{cast from pointer to smaller type 'int' loses information}} +} + +int g() { + int a; // expected-note {{declared here}} + return tfn<int>(a); // expected-error {{call to immediate function 'GH123405::tfn<int>' is not a constant expression}}\ + // expected-note {{read of non-const variable 'a' is not allowed in a constant expression}} +} + +} From 081723b9db84e78d7dd240b46af2aeb3b51b00be Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 27 Jan 2025 14:56:47 +0000 Subject: [PATCH 199/432] [lldb][TypeSystem] Ensure that ParmVarDecls have the correct DeclContext (#124279) While sifting through this part of the code I noticed that when we parse C++ methods, `DWARFASTParserClang` creates two sets of `ParmVarDecls`, once in `ParseChildParameters` and once in `AddMethodToCXXRecordType`. The former is unused when we're dealing with methods. Moreover, the `ParmVarDecls` we created in `ParseChildParameters` were created with an incorrect `clang::DeclContext` (namely the DeclContext of the function, and not the function itself). In Clang, there's `ParmVarDecl::setOwningFunction` to adjust the DeclContext of a parameter if the parameter was created before the FunctionDecl. But we never used it. This patch removes the `ParmVarDecl` creation from `ParseChildParameters` and instead adds a `TypeSystemClang::CreateParameterDeclarations` helper that ensures we set the DeclContext correctly. Note there is one difference in how `ParmVarDecl`s would be created now: we won't set a ClangASTMetadata entry for any of the parameters. I don't think this was ever actually useful for parameter DIEs anyway. This wasn't causing any concrete issues (that I know of), but was quite surprising. And this way of setting the parameters seems easier to reason about (in my opinion).
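To make the DeclContext issue concrete: a `clang::ParmVarDecl` is parented to whichever `DeclContext` it is created with, so a parameter created against the function's enclosing context and never re-parented ends up outside the function. A rough sketch of the correct ordering, assuming the `ParmVarDecl::Create` signature used elsewhere in this patch and hypothetical `Ctx`, `FuncDecl` and `ParamTy` variables:

// Create the parameter once the FunctionDecl exists, so its
// DeclContext is the function itself.
clang::ParmVarDecl *Param = clang::ParmVarDecl::Create(
    Ctx, FuncDecl, clang::SourceLocation(), clang::SourceLocation(),
    /*Id=*/nullptr, ParamTy, /*TInfo=*/nullptr, clang::SC_None,
    /*DefArg=*/nullptr);
assert(Param->getDeclContext() == FuncDecl);
// Had the parameter been created against another context first, it would
// need explicit re-parenting: Param->setOwningFunction(FuncDecl);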
--- .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 36 ++-- .../SymbolFile/DWARF/DWARFASTParserClang.h | 12 +- .../SymbolFile/NativePDB/PdbAstBuilder.cpp | 2 +- .../Plugins/SymbolFile/PDB/PDBASTParser.cpp | 4 +- .../TypeSystem/Clang/TypeSystemClang.cpp | 50 +++--- .../TypeSystem/Clang/TypeSystemClang.h | 21 ++- lldb/unittests/Symbol/TestTypeSystemClang.cpp | 42 +++++ .../DWARF/DWARFASTParserClangTests.cpp | 170 ++++++++++++++++++ 8 files changed, 285 insertions(+), 52 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index e77188bfbd2e4..6602dd763ba69 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -1272,7 +1272,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, return_clang_type = m_ast.GetBasicType(eBasicTypeVoid); std::vector function_param_types; - std::vector function_param_decls; + llvm::SmallVector function_param_names; // Parse the function children for the parameters @@ -1284,7 +1284,7 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, if (die.HasChildren()) { ParseChildParameters(containing_decl_ctx, die, is_variadic, has_template_params, function_param_types, - function_param_decls); + function_param_names); } bool is_cxx_method = DeclKindIsCXXClass(containing_decl_ctx->getDeclKind()); @@ -1414,12 +1414,14 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, LinkDeclContextToDIE(function_decl, die); - if (!function_param_decls.empty()) { - m_ast.SetFunctionParameters(function_decl, function_param_decls); - if (template_function_decl) - m_ast.SetFunctionParameters(template_function_decl, - function_param_decls); - } + const clang::FunctionProtoType *function_prototype( + llvm::cast( + ClangUtil::GetQualType(clang_type).getTypePtr())); + const auto params = m_ast.CreateParameterDeclarations( + function_decl, *function_prototype, function_param_names); + function_decl->setParams(params); + if (template_function_decl) + template_function_decl->setParams(params); ClangASTMetadata metadata; metadata.SetUserID(die.GetID()); @@ -2380,7 +2382,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { bool is_variadic = false; bool has_template_params = false; std::vector param_types; - std::vector param_decls; + llvm::SmallVector param_names; StreamString sstr; DWARFDeclContext decl_ctx = die.GetDWARFDeclContext(); @@ -2394,7 +2396,7 @@ DWARFASTParserClang::ConstructDemangledNameFromDWARF(const DWARFDIE &die) { die, GetCXXObjectParameter(die, *containing_decl_ctx)); ParseChildParameters(containing_decl_ctx, die, is_variadic, - has_template_params, param_types, param_decls); + has_template_params, param_types, param_names); sstr << "("; for (size_t i = 0; i < param_types.size(); i++) { if (i > 0) @@ -3157,7 +3159,7 @@ void DWARFASTParserClang::ParseChildParameters( clang::DeclContext *containing_decl_ctx, const DWARFDIE &parent_die, bool &is_variadic, bool &has_template_params, std::vector &function_param_types, - std::vector &function_param_decls) { + llvm::SmallVectorImpl &function_param_names) { if (!parent_die) return; @@ -3168,22 +3170,14 @@ void DWARFASTParserClang::ParseChildParameters( if (die.GetAttributeValueAsUnsigned(DW_AT_artificial, 0)) continue; - const char *name = die.GetName(); DWARFDIE param_type_die = die.GetAttributeValueAsReferenceDIE(DW_AT_type); Type *type = die.ResolveTypeUID(param_type_die); if (!type) break; + 
function_param_names.emplace_back(die.GetName()); function_param_types.push_back(type->GetForwardCompilerType()); - - clang::ParmVarDecl *param_var_decl = m_ast.CreateParameterDeclaration( - containing_decl_ctx, GetOwningClangModule(die), name, - type->GetForwardCompilerType(), clang::StorageClass::SC_None); - assert(param_var_decl); - function_param_decls.push_back(param_var_decl); - - m_ast.SetMetadataAsUserID(param_var_decl, die.GetID()); } break; case DW_TAG_unspecified_parameters: @@ -3205,6 +3199,8 @@ void DWARFASTParserClang::ParseChildParameters( break; } } + + assert(function_param_names.size() == function_param_names.size()); } clang::Decl *DWARFASTParserClang::GetClangDeclForDIE(const DWARFDIE &die) { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index a5c3746ada4c3..d1eb2bcc2592e 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -186,12 +186,12 @@ class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { const lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); - void - ParseChildParameters(clang::DeclContext *containing_decl_ctx, - const lldb_private::plugin::dwarf::DWARFDIE &parent_die, - bool &is_variadic, bool &has_template_params, - std::vector &function_args, - std::vector &function_param_decls); + void ParseChildParameters( + clang::DeclContext *containing_decl_ctx, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + bool &is_variadic, bool &has_template_params, + std::vector &function_param_types, + llvm::SmallVectorImpl &function_param_names); size_t ParseChildEnumerators( const lldb_private::CompilerType &compiler_type, bool is_signed, diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp index 0c71df625ae34..5d4b22d08b111 100644 --- a/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp +++ b/lldb/source/Plugins/SymbolFile/NativePDB/PdbAstBuilder.cpp @@ -1137,7 +1137,7 @@ void PdbAstBuilder::CreateFunctionParameters(PdbCompilandSymId func_id, } if (!params.empty() && params.size() == param_count) - m_clang.SetFunctionParameters(&function_decl, params); + function_decl.setParams(params); } clang::QualType PdbAstBuilder::CreateEnumType(PdbTypeSymId id, diff --git a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp index fa3530a0c22ff..990bacd89bf34 100644 --- a/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp +++ b/lldb/source/Plugins/SymbolFile/PDB/PDBASTParser.cpp @@ -975,8 +975,8 @@ PDBASTParser::GetDeclForSymbol(const llvm::pdb::PDBSymbol &symbol) { } } } - if (params.size()) - m_ast.SetFunctionParameters(decl, params); + if (params.size() && decl) + decl->setParams(params); m_uid_to_decl[sym_id] = decl; diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index 47051f2e68090..fc3dbfa311c9b 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -2217,12 +2217,6 @@ ParmVarDecl *TypeSystemClang::CreateParameterDeclaration( return decl; } -void TypeSystemClang::SetFunctionParameters( - FunctionDecl *function_decl, llvm::ArrayRef params) { - if (function_decl) - function_decl->setParams(params); -} - CompilerType 
TypeSystemClang::CreateBlockPointerType(const CompilerType &function_type) { QualType block_type = m_ast_up->getBlockPointerType( @@ -7708,6 +7702,32 @@ void TypeSystemClang::SetFloatingInitializerForVariable( ast, init_value, true, qt.getUnqualifiedType(), SourceLocation())); } +llvm::SmallVector +TypeSystemClang::CreateParameterDeclarations( + clang::FunctionDecl *func, const clang::FunctionProtoType &prototype, + const llvm::SmallVector ¶meter_names) { + assert(func); + assert(parameter_names.empty() || + parameter_names.size() == prototype.getNumParams()); + + llvm::SmallVector params; + for (unsigned param_index = 0; param_index < prototype.getNumParams(); + ++param_index) { + llvm::StringRef name = + !parameter_names.empty() ? parameter_names[param_index] : ""; + + auto *param = + CreateParameterDeclaration(func, /*owning_module=*/{}, name.data(), + GetType(prototype.getParamType(param_index)), + clang::SC_None, /*add_decl=*/false); + assert(param); + + params.push_back(param); + } + + return params; +} + clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType( lldb::opaque_compiler_type_t type, llvm::StringRef name, const char *mangled_name, const CompilerType &method_clang_type, @@ -7848,20 +7868,10 @@ clang::CXXMethodDecl *TypeSystemClang::AddMethodToCXXRecordType( getASTContext(), mangled_name, /*literal=*/false)); } - // Populate the method decl with parameter decls - - llvm::SmallVector params; - - for (unsigned param_index = 0; param_index < num_params; ++param_index) { - params.push_back(clang::ParmVarDecl::Create( - getASTContext(), cxx_method_decl, clang::SourceLocation(), - clang::SourceLocation(), - nullptr, // anonymous - method_function_prototype->getParamType(param_index), nullptr, - clang::SC_None, nullptr)); - } - - cxx_method_decl->setParams(llvm::ArrayRef(params)); + // Parameters on member function declarations in DWARF generally don't + // have names, so we omit them when creating the ParmVarDecls. + cxx_method_decl->setParams(CreateParameterDeclarations( + cxx_method_decl, *method_function_prototype, /*parameter_names=*/{})); AddAccessSpecifierDecl(cxx_record_decl, getASTContext(), GetCXXRecordDeclAccess(cxx_record_decl), diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h index 678eaed381fd4..83f954270e309 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.h @@ -489,9 +489,6 @@ class TypeSystemClang : public TypeSystem { const char *name, const CompilerType ¶m_type, int storage, bool add_decl = false); - void SetFunctionParameters(clang::FunctionDecl *function_decl, - llvm::ArrayRef params); - CompilerType CreateBlockPointerType(const CompilerType &function_type); // Array Types @@ -976,6 +973,24 @@ class TypeSystemClang : public TypeSystem { SetFloatingInitializerForVariable(clang::VarDecl *var, const llvm::APFloat &init_value); + /// For each parameter type of \c prototype, creates a \c clang::ParmVarDecl + /// whose \c clang::DeclContext is \c context. + /// + /// \param[in] context Non-null \c clang::FunctionDecl which will be the \c + /// clang::DeclContext of each parameter created/returned by this function. + /// \param[in] prototype The \c clang::FunctionProtoType of \c context. + /// \param[in] param_names The ith element of this vector contains the name + /// of the ith parameter. This parameter may be unnamed, in which case the + /// ith entry in \c param_names is an empty string. 
This vector is either + /// empty, or will have an entry for *each* parameter of the prototype + /// regardless of whether a parameter is unnamed or not. + /// + /// \returns A list of newly created of non-null \c clang::ParmVarDecl (one + /// for each parameter of \c prototype). + llvm::SmallVector CreateParameterDeclarations( + clang::FunctionDecl *context, const clang::FunctionProtoType &prototype, + const llvm::SmallVector ¶m_names); + clang::CXXMethodDecl *AddMethodToCXXRecordType( lldb::opaque_compiler_type_t type, llvm::StringRef name, const char *mangled_name, const CompilerType &method_type, diff --git a/lldb/unittests/Symbol/TestTypeSystemClang.cpp b/lldb/unittests/Symbol/TestTypeSystemClang.cpp index a2d1f6db80277..23374062127e0 100644 --- a/lldb/unittests/Symbol/TestTypeSystemClang.cpp +++ b/lldb/unittests/Symbol/TestTypeSystemClang.cpp @@ -1040,3 +1040,45 @@ TEST_F(TestTypeSystemClang, GetDeclContextByNameWhenMissingSymbolFile) { EXPECT_TRUE(decls.empty()); } + +TEST_F(TestTypeSystemClang, AddMethodToCXXRecordType_ParmVarDecls) { + // Tests that AddMethodToCXXRecordType creates ParmVarDecl's with + // a correct clang::DeclContext. + + llvm::StringRef class_name = "S"; + CompilerType t = clang_utils::createRecord(*m_ast, class_name); + m_ast->StartTagDeclarationDefinition(t); + + CompilerType return_type = m_ast->GetBasicType(lldb::eBasicTypeVoid); + const bool is_virtual = false; + const bool is_static = false; + const bool is_inline = false; + const bool is_explicit = true; + const bool is_attr_used = false; + const bool is_artificial = false; + + llvm::SmallVector param_types{ + m_ast->GetBasicType(lldb::eBasicTypeInt), + m_ast->GetBasicType(lldb::eBasicTypeShort)}; + CompilerType function_type = m_ast->CreateFunctionType( + return_type, param_types.data(), /*num_params*/ param_types.size(), + /*variadic=*/false, /*quals*/ 0U); + m_ast->AddMethodToCXXRecordType( + t.GetOpaqueQualType(), "myFunc", nullptr, function_type, + lldb::AccessType::eAccessPublic, is_virtual, is_static, is_inline, + is_explicit, is_attr_used, is_artificial); + + // Complete the definition and check the created record. + m_ast->CompleteTagDeclarationDefinition(t); + + auto *record = llvm::cast(ClangUtil::GetAsTagDecl(t)); + + auto method_it = record->method_begin(); + ASSERT_NE(method_it, record->method_end()); + + EXPECT_EQ(method_it->getNumParams(), param_types.size()); + + // DeclContext of each parameter should be the CXXMethodDecl itself. + EXPECT_EQ(method_it->getParamDecl(0)->getDeclContext(), *method_it); + EXPECT_EQ(method_it->getParamDecl(1)->getDeclContext(), *method_it); +} diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp index 8adda6fba3a0b..6c77736113da3 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFASTParserClangTests.cpp @@ -1082,3 +1082,173 @@ TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ExplicitObjectParameter) { clang::Qualifiers::fromCVRMask(clang::Qualifiers::Const | clang::Qualifiers::Volatile)); } + +TEST_F(DWARFASTParserClangTests, TestParseSubroutine_ParameterCreation) { + // Tests parsing of a C++ free function will create clang::ParmVarDecls with + // the correct clang::DeclContext. + // + // Also ensures we attach names to the ParmVarDecls (even when DWARF contains + // a mix of named/unnamed parameters). 
+ + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_AARCH64 +DWARF: + debug_str: + - func + - int + - short + - namedParam + debug_abbrev: + - ID: 0 + Table: + - Code: 0x1 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x2 + Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x3 + Tag: DW_TAG_subprogram + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_declaration + Form: DW_FORM_flag_present + - Attribute: DW_AT_external + Form: DW_FORM_flag_present + - Code: 0x4 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Code: 0x5 + Tag: DW_TAG_formal_parameter + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_type + Form: DW_FORM_ref4 + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x6 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Attribute: DW_AT_encoding + Form: DW_FORM_data1 + - Attribute: DW_AT_byte_size + Form: DW_FORM_data1 + debug_info: + - Version: 5 + UnitType: DW_UT_compile + AddrSize: 8 + Entries: + +# DW_TAG_compile_unit +# DW_AT_language [DW_FORM_data2] (DW_LANG_C_plus_plus) + + - AbbrCode: 0x1 + Values: + - Value: 0x04 + +# DW_TAG_subprogram +# DW_AT_name [DW_FORM_strp] ("func") + - AbbrCode: 0x3 + Values: + - Value: 0x0 + - Value: 0x1 + - Value: 0x1 + +# DW_TAG_formal_parameter +# DW_AT_type [DW_FORM_ref4] (int) + - AbbrCode: 0x4 + Values: + - Value: 0x23 + +# DW_TAG_formal_parameter +# DW_AT_type [DW_FORM_ref4] (short) +# DW_AT_name [DW_FORM_strp] ("namedParam") + - AbbrCode: 0x5 + Values: + - Value: 0x2a + - Value: 0xf + + - AbbrCode: 0x0 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ("int") +# DW_AT_encoding [DW_FORM_data1] +# DW_AT_byte_size [DW_FORM_data1] + + - AbbrCode: 0x6 + Values: + - Value: 0x0000000000000005 + - Value: 0x0000000000000005 # DW_ATE_signed + - Value: 0x0000000000000004 + +# DW_TAG_base_type +# DW_AT_name [DW_FORM_strp] ("short") +# DW_AT_encoding [DW_FORM_data1] +# DW_AT_byte_size [DW_FORM_data1] + + - AbbrCode: 0x6 + Values: + - Value: 0x0000000000000009 + - Value: 0x0000000000000005 # DW_ATE_signed + - Value: 0x0000000000000004 + + - AbbrCode: 0x0 +... 
+)"; + YAMLModuleTester t(yamldata); + + DWARFUnit *unit = t.GetDwarfUnit(); + ASSERT_NE(unit, nullptr); + const DWARFDebugInfoEntry *cu_entry = unit->DIE().GetDIE(); + ASSERT_EQ(cu_entry->Tag(), DW_TAG_compile_unit); + ASSERT_EQ(unit->GetDWARFLanguageType(), DW_LANG_C_plus_plus); + DWARFDIE cu_die(unit, cu_entry); + + auto ts_or_err = + cu_die.GetDWARF()->GetTypeSystemForLanguage(eLanguageTypeC_plus_plus); + ASSERT_TRUE(static_cast(ts_or_err)); + llvm::consumeError(ts_or_err.takeError()); + + auto *ts = static_cast(ts_or_err->get()); + auto *parser = static_cast(ts->GetDWARFParser()); + + auto subprogram = cu_die.GetFirstChild(); + ASSERT_TRUE(subprogram.IsValid()); + ASSERT_EQ(subprogram.Tag(), DW_TAG_subprogram); + + SymbolContext sc; + bool new_type; + auto type_sp = parser->ParseTypeFromDWARF(sc, subprogram, &new_type); + ASSERT_NE(type_sp, nullptr); + + auto result = ts->GetTranslationUnitDecl()->lookup( + clang_utils::getDeclarationName(*ts, "func")); + ASSERT_TRUE(result.isSingleResult()); + + auto const *func = llvm::cast(result.front()); + + EXPECT_EQ(func->getNumParams(), 2U); + EXPECT_EQ(func->getParamDecl(0)->getDeclContext(), func); + EXPECT_TRUE(func->getParamDecl(0)->getName().empty()); + EXPECT_EQ(func->getParamDecl(1)->getDeclContext(), func); + EXPECT_EQ(func->getParamDecl(1)->getName(), "namedParam"); +} From 5c5bbffe75caaaefdc68305e85a625a057b09159 Mon Sep 17 00:00:00 2001 From: Michael Buch Date: Mon, 27 Jan 2025 14:57:09 +0000 Subject: [PATCH 200/432] [clang][ASTImporter] Import source location of explicit object parameter instead of copying it (#124305) We used to copy the `SourceLocation` instead of importing it, which isn't correct since the `SourceManager`'s of the source and target ASTContext might differ. Also adds test that confirms that we import the explicit object parameter location for `ParmVarDecl`s. This is how Clang determines whether a parameter `isExplicitObjectParamater`. The LLDB expression evaluator relies on this for calling "explicit object member functions". 
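For context, an explicit object member function is the C++23 "deducing this" form, where the object parameter is spelled out in the parameter list; Clang derives `ParmVarDecl::isExplicitObjectParameter()` from the validity of exactly the source location this patch now imports. A small self-contained illustration (hypothetical type, not taken from the patch):

// C++23: `self` is an explicit object parameter replacing the implicit `this`.
struct Counter {
  int n = 0;
  void bump(this Counter &self, int by) { self.n += by; }
};

int main() {
  Counter c;
  c.bump(3); // `c` binds to the explicit object parameter
  return c.n; // 3
}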
--- clang/lib/AST/ASTImporter.cpp | 8 ++++++-- clang/unittests/AST/ASTImporterTest.cpp | 20 ++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 0669aa1b809c3..be1a65a49622d 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -4701,9 +4701,13 @@ ExpectedDecl ASTNodeImporter::VisitImplicitParamDecl(ImplicitParamDecl *D) { Error ASTNodeImporter::ImportDefaultArgOfParmVarDecl( const ParmVarDecl *FromParam, ParmVarDecl *ToParam) { + + if (auto LocOrErr = import(FromParam->getExplicitObjectParamThisLoc())) + ToParam->setExplicitObjectParameterLoc(*LocOrErr); + else + return LocOrErr.takeError(); + ToParam->setHasInheritedDefaultArg(FromParam->hasInheritedDefaultArg()); - ToParam->setExplicitObjectParameterLoc( - FromParam->getExplicitObjectParamThisLoc()); ToParam->setKNRPromoted(FromParam->isKNRPromoted()); if (FromParam->hasUninstantiatedDefaultArg()) { diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp index 791248e7a394f..e77860521335e 100644 --- a/clang/unittests/AST/ASTImporterTest.cpp +++ b/clang/unittests/AST/ASTImporterTest.cpp @@ -3441,6 +3441,7 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportParmVarDecl) { ASSERT_TRUE(FromVar); ASSERT_TRUE(FromVar->hasUninstantiatedDefaultArg()); ASSERT_TRUE(FromVar->getUninstantiatedDefaultArg()); + ASSERT_FALSE(FromVar->isExplicitObjectParameter()); const auto *ToVar = Import(FromVar, Lang_CXX11); EXPECT_TRUE(ToVar); @@ -3448,6 +3449,25 @@ TEST_P(ASTImporterOptionSpecificTestBase, ImportParmVarDecl) { EXPECT_TRUE(ToVar->getUninstantiatedDefaultArg()); EXPECT_NE(FromVar->getUninstantiatedDefaultArg(), ToVar->getUninstantiatedDefaultArg()); + EXPECT_FALSE(ToVar->isExplicitObjectParameter()); +} + +TEST_P(ASTImporterOptionSpecificTestBase, ImportParmVarDecl_Explicit) { + const auto *Code = R"( + struct Wrapper { + void func(this Wrapper) {} + }; + )"; + Decl *FromTU = getTuDecl(Code, Lang_CXX23); + auto *FromVar = FirstDeclMatcher<ParmVarDecl>().match(FromTU, parmVarDecl()); + ASSERT_TRUE(FromVar); + ASSERT_TRUE(FromVar->isExplicitObjectParameter()); + + const auto *ToVar = Import(FromVar, Lang_CXX23); + EXPECT_TRUE(ToVar); + EXPECT_TRUE(ToVar->isExplicitObjectParameter()); + EXPECT_NE(ToVar->getExplicitObjectParamThisLoc(), + FromVar->getExplicitObjectParamThisLoc()); } TEST_P(ASTImporterOptionSpecificTestBase, ImportOfNonEquivalentField) { From 1f5335c1db5d54b4465677c224b48e0ffc78e6d9 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Mon, 27 Jan 2025 07:04:50 -0800 Subject: [PATCH 201/432] Make index computation use divsi/remsi (#124390) The index computation is meant to be signed. Using unsigned could lead to subtle errors. Fix places where some index math was using unsigned operations.
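For context, a hedged sketch of the signed delinearization this patch standardizes on; `rewriter`, `loc`, `linearIdx`, and `dimSize` are assumed to be in scope:

    // Index arithmetic in MLIR is signed, so splitting a linearized index
    // into (outer, inner) coordinates uses divsi/remsi; the unsigned forms
    // would compute wrong coordinates if a negative index ever reached them.
    Value inner = rewriter.create<arith::RemSIOp>(loc, linearIdx, dimSize); // idx % size
    Value outer = rewriter.create<arith::DivSIOp>(loc, linearIdx, dimSize); // idx / size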
Signed-off-by: MaheshRavishankar --- mlir/lib/Dialect/Arith/Utils/Utils.cpp | 2 +- .../Linalg/Transforms/ElementwiseOpFusion.cpp | 5 +- .../TosaToTensor/tosa-to-tensor.mlir | 24 +++---- .../Linalg/data-layout-propagation.mlir | 6 +- .../fuse-with-reshape-by-collapsing.mlir | 38 +++++------ .../Dialect/Linalg/fusion-push-reshape.mlir | 2 +- mlir/test/Dialect/Linalg/reshape_fusion.mlir | 66 +++++++++---------- mlir/test/Dialect/Tensor/bufferize.mlir | 4 +- 8 files changed, 74 insertions(+), 73 deletions(-) diff --git a/mlir/lib/Dialect/Arith/Utils/Utils.cpp b/mlir/lib/Dialect/Arith/Utils/Utils.cpp index 39c9005e449e3..8dde9866b22b3 100644 --- a/mlir/lib/Dialect/Arith/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Arith/Utils/Utils.cpp @@ -69,7 +69,7 @@ mlir::inferExpandShapeOutputShape(OpBuilder &b, Location loc, Value indexGroupSize = cast<Value>(inputShape[inputIndex]); Value indexGroupStaticSizesProduct = b.create<arith::ConstantIndexOp>(loc, indexGroupStaticSizesProductInt); - Value dynamicDimSize = b.createOrFold<arith::DivUIOp>( + Value dynamicDimSize = b.createOrFold<arith::DivSIOp>( loc, indexGroupSize, indexGroupStaticSizesProduct); outputShapeValues.push_back(dynamicDimSize); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index 3a57f368d4425..60cae77644291 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -13,6 +13,7 @@ #include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Arith/Utils/Utils.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" @@ -1572,9 +1573,9 @@ void generateCollapsedIndexingRegion(Location loc, Block *block, rewriter.create<arith::ConstantIndexOp>(loc, foldedDims.index()); for (auto dim : llvm::reverse(foldedDimsRef.drop_front())) { indexReplacementVals[dim] = - rewriter.create<arith::RemUIOp>(loc, newIndexVal, loopRange[dim]); + rewriter.create<arith::RemSIOp>(loc, newIndexVal, loopRange[dim]); newIndexVal = - rewriter.create<arith::DivUIOp>(loc, newIndexVal, loopRange[dim]); + rewriter.create<arith::DivSIOp>(loc, newIndexVal, loopRange[dim]); } indexReplacementVals[foldedDims.value().front()] = newIndexVal; } diff --git a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir index 2f11b31aad230..27018fb79f60d 100644 --- a/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir +++ b/mlir/test/Conversion/TosaToTensor/tosa-to-tensor.mlir @@ -86,7 +86,7 @@ func.func @test_reshape_1d_down_s2s_explicit(%arg0: tensor<1xf32>) -> tensor // CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C2]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C2]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[ARG_0]] {{\[\[}}0, 1]] output_shape [2, %[[VAL_0]]] : tensor into tensor<2x?xf32> // CHECK: return %[[EXPANDED]] : tensor<2x?xf32> func.func @test_reshape_1d_up_d2d_auto(%arg0: tensor) -> tensor<2x?xf32> { @@ -135,7 +135,7 @@ func.func @test_reshape_2d_down_s2s_explicit(%arg0: tensor<2x3xf32>) -> tensor<6 // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C2]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C2]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1]] output_shape [2, %[[DIV]]] : tensor
into tensor<2x?xf32> // CHECK: return %[[EXPANDED]] : tensor<2x?xf32> func.func @test_reshape_2d_same_d2d_auto(%arg0: tensor) -> tensor<2x?xf32> { @@ -189,7 +189,7 @@ func.func @test_reshape_2d_same_s2s_explicit(%arg0: tensor<3x2xf32>) -> tensor<2 // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C0_0]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C0_0]] : index // CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [0, 3, %[[DIV]]] : tensor into tensor<0x3x?xf32> // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<0x3x?xf32> to tensor // CHECK: return %[[VAL_2]] : tensor @@ -206,7 +206,7 @@ func.func @test_reshape_3d_same_d2d_auto_empty(%arg0: tensor<3x2x?xf32>) -> tens // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C8:.*]] = arith.constant 8 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C8]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C8]] : index // CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, %[[DIV]], 4] : tensor into tensor<2x?x4xf32> // CHECK: %[[VAL_2:.*]] = tensor.cast %[[VAL_1]] : tensor<2x?x4xf32> to tensor // CHECK: return %[[VAL_2]] : tensor @@ -223,7 +223,7 @@ func.func @test_reshape_3d_same_d2d_auto(%arg0: tensor<2x?x?xf32>) -> tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[VAL_1:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, 3, %[[DIV]]] : tensor into tensor<2x3x?xf32> // CHECK: return %[[VAL_1]] : tensor<2x3x?xf32> func.func @test_reshape_3d_same_d2d_auto_identity(%arg0: tensor) -> tensor<2x3x?xf32> { @@ -239,7 +239,7 @@ func.func @test_reshape_3d_same_d2d_auto_identity(%arg0: tensor) -> t // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 2] : tensor into tensor // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor // CHECK: return %[[VAL_2]] : tensor @@ -256,7 +256,7 @@ func.func @test_reshape_3d_same_d2d_explicit_empty(%arg0: tensor<3x2x?xf32>) -> // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C12]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C12]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 4] : tensor into tensor // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor // CHECK: return %[[VAL_2]] : tensor @@ -284,7 +284,7 @@ func.func @test_reshape_3d_same_d2d_explicit_identity(%arg0: tensor) // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C8:.*]] = arith.constant 8 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C8]] : index +// CHECK: %[[DIV:.*]] = 
arith.divsi %[[DIM]], %[[C8]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [2, %[[DIV]], 4] : tensor into tensor<2x?x4xf32> // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor<2x?x4xf32> to tensor<2x3x4xf32> // CHECK: return %[[VAL_2]] : tensor<2x3x4xf32> @@ -301,7 +301,7 @@ func.func @test_reshape_3d_same_d2s_auto(%arg0: tensor) -> tensor<2x3 // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[VAL_0]], %[[C0]] : tensor // CHECK: %[[C12:.*]] = arith.constant 12 : index -// CHECK: %[[DIV:.*]] = arith.divui %[[DIM]], %[[C12]] : index +// CHECK: %[[DIV:.*]] = arith.divsi %[[DIM]], %[[C12]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[VAL_0]] {{\[\[}}0, 1, 2]] output_shape [%[[DIV]], 3, 4] : tensor into tensor // CHECK: %[[VAL_2:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor<2x3x4xf32> // CHECK: return %[[VAL_2]] : tensor<2x3x4xf32> @@ -328,7 +328,7 @@ func.func @test_reshape_3d_same_s2s_explicit_identity(%arg0: tensor<2x3x4xf32>) // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2, 3]] output_shape [%[[VAL_0]], 3, 2, 1] : tensor into tensor // CHECK: %[[CAST:.*]] = tensor.cast %[[EXPANDED]] : tensor to tensor<1x3x2x1xf32> // CHECK: return %[[CAST]] : tensor<1x3x2x1xf32> @@ -357,7 +357,7 @@ func.func @test_reshape_4d_down_d2s_explicit(%arg0: tensor) -> tens // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor // CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C6]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C6]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2]] output_shape [%[[VAL_0]], 2, 3] : tensor into tensor // CHECK: return %[[EXPANDED]] : tensor func.func @test_reshape_5d_down_d2d_auto(%arg0: tensor) -> tensor { @@ -373,7 +373,7 @@ func.func @test_reshape_5d_down_d2d_auto(%arg0: tensor) -> tensor // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = tensor.dim %[[COLLAPSED]], %[[C0]] : tensor // CHECK: %[[C385:.*]] = arith.constant 385 : index -// CHECK: %[[VAL_0:.*]] = arith.divui %[[DIM]], %[[C385]] : index +// CHECK: %[[VAL_0:.*]] = arith.divsi %[[DIM]], %[[C385]] : index // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[COLLAPSED]] {{\[\[}}0, 1, 2]] output_shape [%[[VAL_0]], 5, 77] : tensor into tensor // CHECK: return %[[EXPANDED]] : tensor func.func @test_reshape_6d_down_d2d_auto(%arg0: tensor<1x2x?x5x7x11xf32>) -> tensor { diff --git a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir index 07708231a6e2f..cb8064411bbae 100644 --- a/mlir/test/Dialect/Linalg/data-layout-propagation.mlir +++ b/mlir/test/Dialect/Linalg/data-layout-propagation.mlir @@ -1301,7 +1301,7 @@ func.func @push_down_unpack_through_expand(%5: tensor, %dim: index // CHECK: %[[C32:.+]] = arith.constant 32 : index // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor -// CHECK: %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C32]] : index +// CHECK: %[[SZ0:.+]] = arith.divsi %[[DIM0]], 
%[[C32]] : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] output_shape [%[[SZ0]], 32, 32, 8, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor @@ -1322,7 +1322,7 @@ func.func @push_down_unpack_through_expand_empty_outer_dims_perm(%5: tensor -// CHECK: %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C32]] : index +// CHECK: %[[SZ0:.+]] = arith.divsi %[[DIM0]], %[[C32]] : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3], [4]] output_shape [%[[SZ0]], 32, 32, 8, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor @@ -1373,7 +1373,7 @@ func.func @push_down_unpack_through_expand_on_outer_dims(%5: tensor, // CHECK: %[[C256:.+]] = arith.constant 256 : index // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM0:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor -// CHECK: %[[SZ0:.+]] = arith.divui %[[DIM0]], %[[C256]] : index +// CHECK: %[[SZ0:.+]] = arith.divsi %[[DIM0]], %[[C256]] : index // CHECK: %[[EXPANDED:.+]] = tensor.expand_shape %[[ARG0]] {{\[}}[0, 1], [2], [3]] output_shape [%[[SZ0]], 256, 32, 8] : tensor into tensor // CHECK: %[[DIM:.+]] = tensor.dim %[[EXPANDED]], %[[C0]] : tensor // CHECK: %[[EMPTY:.+]] = tensor.empty(%[[DIM]]) : tensor diff --git a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir index f17881d59a266..7db997cd4c0b5 100644 --- a/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir +++ b/mlir/test/Dialect/Linalg/fuse-with-reshape-by-collapsing.mlir @@ -99,14 +99,14 @@ func.func @fuse_by_collapsing_indexing_op(%arg0 : tensor<2x12x5x336x9xi32>, // CHECK-DAG: %[[C7:.+]] = arith.constant 7 : index // CHECK: %[[IV0:.+]] = linalg.index 0 // CHECK: %[[IV1:.+]] = linalg.index 1 -// CHECK: %[[REM_IV1:.+]] = arith.remui %[[IV1]], %[[C4]] -// CHECK: %[[DIV_IV1:.+]] = arith.divui %[[IV1]], %[[C4]] +// CHECK: %[[REM_IV1:.+]] = arith.remsi %[[IV1]], %[[C4]] +// CHECK: %[[DIV_IV1:.+]] = arith.divsi %[[IV1]], %[[C4]] // CHECK: %[[IV2:.+]] = linalg.index 2 // CHECK: %[[IV3:.+]] = linalg.index 3 -// CHECK: %[[REM1_IV3:.+]] = arith.remui %[[IV3]], %[[C8]] -// CHECK: %[[DIV1_IV3:.+]] = arith.divui %[[IV3]], %[[C8]] -// CHECK: %[[REM2_IV3:.+]] = arith.remui %[[DIV1_IV3]], %[[C7]] -// CHECK: %[[DIV2_IV3:.+]] = arith.divui %[[DIV1_IV3]], %[[C7]] +// CHECK: %[[REM1_IV3:.+]] = arith.remsi %[[IV3]], %[[C8]] +// CHECK: %[[DIV1_IV3:.+]] = arith.divsi %[[IV3]], %[[C8]] +// CHECK: %[[REM2_IV3:.+]] = arith.remsi %[[DIV1_IV3]], %[[C7]] +// CHECK: %[[DIV2_IV3:.+]] = arith.divsi %[[DIV1_IV3]], %[[C7]] // CHECK: %[[IV4:.+]] = linalg.index 4 // CHECK: %[[T0:.+]] = arith.addi %[[IV0]], %[[DIV_IV1]] // CHECK: %[[T1:.+]] = arith.addi %[[T0]], %[[REM_IV1]] @@ -215,13 +215,13 @@ func.func @fuse_by_collapsing_dynamic(%arg0 : tensor, // CHECK-DAG: %[[D1:.+]] = tensor.dim %[[EXPAND]], %[[C5]] // CHECK: linalg.generic // CHECK: %[[IV0:.+]] = linalg.index 1 -// CHECK: %[[REM1_IV0:.+]] = arith.remui %[[IV0]], %[[C5]] -// CHECK: %[[DIV1_IV0:.+]] = arith.divui %[[IV0]], %[[C5]] -// CHECK: %[[REM2_IV0:.+]] = arith.remui %[[DIV1_IV0]], %[[D1]] -// CHECK: %[[DIV2_IV0:.+]] = arith.divui %[[DIV1_IV0]], %[[D1]] +// CHECK: %[[REM1_IV0:.+]] = arith.remsi %[[IV0]], %[[C5]] +// CHECK: %[[DIV1_IV0:.+]] = arith.divsi %[[IV0]], %[[C5]] +// CHECK: %[[REM2_IV0:.+]] = 
arith.remsi %[[DIV1_IV0]], %[[D1]] +// CHECK: %[[DIV2_IV0:.+]] = arith.divsi %[[DIV1_IV0]], %[[D1]] // CHECK: %[[IV1:.+]] = linalg.index 3 -// CHECK: %[[REM1_IV1:.+]] = arith.remui %[[IV1]], %[[D0]] -// CHECK: %[[DIV1_IV1:.+]] = arith.divui %[[IV1]], %[[D0]] +// CHECK: %[[REM1_IV1:.+]] = arith.remsi %[[IV1]], %[[D0]] +// CHECK: %[[DIV1_IV1:.+]] = arith.divsi %[[IV1]], %[[D0]] // ----- @@ -439,7 +439,7 @@ func.func @fuse_only_one_reassociation(%arg0 : tensor, %arg1 : tensor<4 // CHECK-SAME: outs(%[[COLLAPSE_ARG1_1]] : // CHECK: %[[DIM:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor<4x?x?xf32> // CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C2]] : tensor<4x?x?xf32> -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C8]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C8]] : index // CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0], [1], [2, 3]] output_shape [4, %[[DIM]], %[[VAL_1]], 8] : tensor<4x?x?xf32> into tensor<4x?x?x8xf32> // CHECK: return %[[EXPANDED_3]] @@ -492,11 +492,11 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK-SAME: outs(%[[COLLAPSE_INIT]] : // CHECK-NEXT: ^bb{{[0-9]}} // CHECK: %[[ID0:.+]] = linalg.index 0 -// CHECK-DAG: %[[T0:.+]] = arith.remui %[[ID0]], %[[C4]] -// CHECK-DAG: %[[T1:.+]] = arith.divui %[[ID0]], %[[C4]] +// CHECK-DAG: %[[T0:.+]] = arith.remsi %[[ID0]], %[[C4]] +// CHECK-DAG: %[[T1:.+]] = arith.divsi %[[ID0]], %[[C4]] // CHECK: %[[ID1:.+]] = linalg.index 1 -// CHECK-DAG: %[[T2:.+]] = arith.remui %[[ID1]], %[[C8]] -// CHECK-DAG: %[[T3:.+]] = arith.divui %[[ID1]], %[[C8]] +// CHECK-DAG: %[[T2:.+]] = arith.remsi %[[ID1]], %[[C8]] +// CHECK-DAG: %[[T3:.+]] = arith.divsi %[[ID1]], %[[C8]] // CHECK-DAG: %[[T4:.+]] = arith.addi %[[T1]], %[[T2]] // CHECK-DAG: %[[T5:.+]] = arith.addi %[[T4]], %[[T0]] // CHECK-DAG: %[[T6:.+]] = arith.addi %[[T5]], %[[T3]] @@ -504,8 +504,8 @@ func.func @fold_non_consecutive_dims(%arg0 : tensor, %sz0: index, %sz1: // CHECK: linalg.yield %[[T7]] // CHECK: %[[DIM_1:.+]] = tensor.dim %[[GENERIC]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[GENERIC]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C8]] : index -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C4]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C8]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C4]] : index // CHECK: %[[EXPANDED_3:.+]] = tensor.expand_shape %[[GENERIC]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 4] : tensor into tensor // CHECK: return %[[EXPANDED_3]] diff --git a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir index 751ece37bc094..7acbd843cd1e7 100644 --- a/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir +++ b/mlir/test/Dialect/Linalg/fusion-push-reshape.mlir @@ -12,7 +12,7 @@ // CHECK-SAME: iterator_types = ["parallel", "parallel"]} // CHECK-SAME: ins(%[[A]], %[[B]] : tensor, tensor<16xf32>) outs(%[[RI]] : tensor) // CHECK: %[[DIM:.*]] = tensor.dim %[[R]], %[[C0]] : tensor -// CHECK: %[[VAL_1:.*]] = arith.divui %[[DIM]], %[[C112]] : index +// CHECK: %[[VAL_1:.*]] = arith.divsi %[[DIM]], %[[C112]] : index // CHECK: %[[RR:.*]] = tensor.expand_shape %[[R]] {{\[\[}}0, 1], [2]] output_shape [%[[VAL_1]], 112, 16] : tensor into tensor // CHECK: return %[[RR]] : tensor func.func @reshape(%A: tensor, %B: tensor<16xf32>, %init: tensor, %sz0: index) -> tensor { diff --git a/mlir/test/Dialect/Linalg/reshape_fusion.mlir 
b/mlir/test/Dialect/Linalg/reshape_fusion.mlir index b8df5fc88e199..ef853e4d662a7 100644 --- a/mlir/test/Dialect/Linalg/reshape_fusion.mlir +++ b/mlir/test/Dialect/Linalg/reshape_fusion.mlir @@ -37,12 +37,12 @@ func.func @generic_op_reshape_producer_fusion(%arg0 : tensor, // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM_1]], %[[C4]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM_1]], %[[C4]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1], [2, 3]] output_shape [%[[DIM]], %[[DIM_0]], %[[VAL_0]], 4] : tensor into tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_4]], %[[C4]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_4]], %[[C4]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1], [2, 3]] output_shape [%[[DIM_2]], %[[DIM_3]], %[[VAL_1]], 4] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP5]], #[[MAP6]], #[[MAP7]], #[[MAP6]]] @@ -93,15 +93,15 @@ func.func @generic_op_reshape_consumer_fusion(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM_0]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM_0]], %[[C20]] : index // CHECK: %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM]], 4, %[[VAL_0]], 5] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], 4, %[[VAL_1]], 5] : tensor into tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_5]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], 4, %[[VAL_2]], 5] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP2]], #[[MAP2]], #[[MAP3]], #[[MAP2]]] @@ -144,18 +144,18 @@ func.func @reshape_as_consumer_permutation // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C12]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C2]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C12]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C2]] : index // CHECK: %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3, 4], [5]] output_shape [3, 4, %[[VAL_0]], %[[VAL_1]], 2, %[[DIM_1]]] : tensor into tensor<3x4x?x?x2x?xf32> // CHECK: %[[DIM_2:.+]] = tensor.dim 
%[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_2]], %[[C12]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_2]], %[[C12]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [3, 4, %[[VAL_2]], %[[DIM_3]]] : tensor into tensor<3x4x?x?xf32> // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_6:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_7:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_5]], %[[C2]] : index -// CHECK: %[[VAL_4:.+]] = arith.divui %[[DIM_7]], %[[C12]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_5]], %[[C2]] : index +// CHECK: %[[VAL_4:.+]] = arith.divsi %[[DIM_7]], %[[C12]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2], [3, 4, 5]] output_shape [%[[VAL_3]], 2, %[[DIM_6]], 3, 4, %[[VAL_4]]] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP8]], #[[MAP9]], #[[MAP10]]] @@ -463,15 +463,15 @@ func.func @generic_op_reshape_consumer_fusion_projected(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C20]] : index // CHECK: %[[T0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_0]], 4, 5, %[[DIM_0]]] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_1]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_1]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_1]], 4, 5, %[[DIM_2]]] : tensor into tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_5]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], %[[VAL_2]], 4, 5] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP4]], #[[MAP4]], #[[MAP5]]] @@ -569,24 +569,24 @@ func.func @reshape_as_consumer_permutation_with_multiple_results // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C12]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C2]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C12]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C2]] : index // CHECK: %[[RESHAPE0:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1, 2], [3, 4], [5]] output_shape [3, 4, %[[VAL_0]], %[[VAL_1]], 2, %[[DIM_1]]] : tensor into tensor<3x4x?x?x2x?xf32> // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_3:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_2]], %[[C12]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi 
%[[DIM_2]], %[[C12]] : index // CHECK: %[[RESHAPE1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [3, 4, %[[VAL_2]], %[[DIM_3]]] : tensor into tensor<3x4x?x?xf32> // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_6:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_7:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_5]], %[[C2]] : index -// CHECK: %[[VAL_4:.+]] = arith.divui %[[DIM_7]], %[[C12]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_5]], %[[C2]] : index +// CHECK: %[[VAL_4:.+]] = arith.divsi %[[DIM_7]], %[[C12]] : index // CHECK: %[[RESHAPE2:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0, 1], [2], [3, 4, 5]] output_shape [%[[VAL_3]], 2, %[[DIM_6]], 3, 4, %[[VAL_4]]] : tensor into tensor // CHECK: %[[DIM_9:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_10:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor // CHECK: %[[DIM_11:.+]] = tensor.dim %[[ARG0]], %[[C2]] : tensor -// CHECK: %[[VAL_5:.+]] = arith.divui %[[DIM_10]], %[[C2]] : index -// CHECK: %[[VAL_6:.+]] = arith.divui %[[DIM_11]], %[[C12]] : index +// CHECK: %[[VAL_5:.+]] = arith.divsi %[[DIM_10]], %[[C2]] : index +// CHECK: %[[VAL_6:.+]] = arith.divsi %[[DIM_11]], %[[C12]] : index // CHECK: %[[RESHAPE3:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2], [3, 4, 5]] output_shape [%[[DIM_9]], %[[VAL_5]], 2, 3, 4, %[[VAL_6]]] : tensor into tensor // CHECK: %[[GENERIC:.+]]:2 = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]], #[[MAP3]]] @@ -667,11 +667,11 @@ func.func @generic_op_reshape_consumer_fusion_reduction(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1, 2], [3]] output_shape [%[[VAL_0]], 4, 5, %[[DIM_0]]] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], %[[VAL_1]], 4, 5] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]], #[[MAP2]]] @@ -719,13 +719,13 @@ func.func @generic_op_reshape_producer_fusion_with_reduction(%arg0 : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C2]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C8]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C7]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C8]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C7]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1], [2], [3, 4]] output_shape [%[[VAL_0]], 8, 4, %[[VAL_1]], 7] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C8]] : index -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C7]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C8]] : index +// 
CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C7]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 8, %[[VAL_3]], 7] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]] @@ -764,15 +764,15 @@ func.func @linalg_add_reshape_consumer_fusion(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG0]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG0]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM_0]], %[[C20]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM_0]], %[[C20]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG0]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM]], %[[VAL_0]], 4, 5] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_2]], %[[C20]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_2]], %[[C20]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_1]], %[[VAL_1]], 4, 5] : tensor into tensor // CHECK: %[[DIM_4:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_5:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_5]], %[[C20]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_5]], %[[C20]] : index // CHECK: %[[T3:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0], [1, 2, 3]] output_shape [%[[DIM_4]], %[[VAL_2]], 4, 5] : tensor into tensor // CHECK: %[[T4:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[MAP]], #[[MAP]], #[[MAP]]] @@ -809,13 +809,13 @@ func.func @linalg_add_reshape_producer_fusion(%arg0 : tensor, // CHECK: %[[C0:.+]] = arith.constant 0 : index // CHECK: %[[DIM:.+]] = tensor.dim %[[ARG1]], %[[C0]] : tensor // CHECK: %[[DIM_0:.+]] = tensor.dim %[[ARG1]], %[[C1]] : tensor -// CHECK: %[[VAL_0:.+]] = arith.divui %[[DIM]], %[[C7]] : index -// CHECK: %[[VAL_1:.+]] = arith.divui %[[DIM_0]], %[[C8]] : index +// CHECK: %[[VAL_0:.+]] = arith.divsi %[[DIM]], %[[C7]] : index +// CHECK: %[[VAL_1:.+]] = arith.divsi %[[DIM_0]], %[[C8]] : index // CHECK: %[[T1:.+]] = tensor.expand_shape %[[ARG1]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_0]], 7, %[[VAL_1]], 8] : tensor into tensor // CHECK: %[[DIM_1:.+]] = tensor.dim %[[ARG2]], %[[C0]] : tensor // CHECK: %[[DIM_2:.+]] = tensor.dim %[[ARG2]], %[[C1]] : tensor -// CHECK: %[[VAL_2:.+]] = arith.divui %[[DIM_1]], %[[C7]] : index -// CHECK: %[[VAL_3:.+]] = arith.divui %[[DIM_2]], %[[C8]] : index +// CHECK: %[[VAL_2:.+]] = arith.divsi %[[DIM_1]], %[[C7]] : index +// CHECK: %[[VAL_3:.+]] = arith.divsi %[[DIM_2]], %[[C8]] : index // CHECK: %[[T2:.+]] = tensor.expand_shape %[[ARG2]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_2]], 7, %[[VAL_3]], 8] : tensor into tensor // CHECK: %[[T3:.+]] = linalg.generic // CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP]], #[[$MAP]]] diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir index ecd285be46194..9ea0a15f31185 100644 --- a/mlir/test/Dialect/Tensor/bufferize.mlir +++ b/mlir/test/Dialect/Tensor/bufferize.mlir @@ -372,7 +372,7 @@ func.func @tensor.expand_shape(%t1: tensor, %sz0: index) -> tensor<2x? 
// CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[DIM:.*]] = memref.dim %[[m1]], %[[C0]] : memref // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[VAL_1:.*]] = arith.divui %[[DIM]], %[[C2]] : index + // CHECK: %[[VAL_1:.*]] = arith.divsi %[[DIM]], %[[C2]] : index // CHECK: %[[expanded:.*]] = memref.expand_shape %[[m1]] {{\[\[}}0, 1], [2]] output_shape [2, %[[VAL_1]], 10] : memref into memref<2x?x10xf32> %0 = tensor.expand_shape %t1 [[0, 1], [2]] output_shape [2, %sz0, 10] : tensor into tensor<2x?x10xf32> @@ -393,7 +393,7 @@ func.func @tensor.expand_shape_of_slice( %0 = tensor.extract_slice %t1[%o1, 5][%s1, 10][1, 1] : tensor to tensor // CHECK: %[[C7:.*]] = arith.constant 7 : index - // CHECK: %[[VAL_1:.*]] = arith.divui %{{.*}}, %[[C7]] : index + // CHECK: %[[VAL_1:.*]] = arith.divsi %{{.*}}, %[[C7]] : index // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] {{\[\[}}0, 1], [2, 3]] output_shape [%[[VAL_1]], 7, 2, 5] : memref> into memref> %1 = tensor.expand_shape %0 [[0, 1], [2, 3]] output_shape [%sz0, 7, 2, 5] : tensor into tensor From 27c917307563eae93c7fef9c3944e56e1f5b5f6d Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Mon, 27 Jan 2025 23:04:57 +0800 Subject: [PATCH 202/432] [Clang] Remove unnecessary Decl transform & profiles for SizeOfPackExpr (#124533) We used to always transform the pattern declaration for SizeOfPackExpr to ensure the constraint expression's profile produced the desired result. However, this approach failed to handle pack expansions when the pack referred to function parameters. In such cases, the function parameters were formerly expanded to 1 to avoid building Subst* nodes (see e6974daa7). That workaround caused us to transform a pack without a proper ArgumentPackSubstitutionIndex, leading to crashes when transforming the pattern. It turns out that profiling the pattern for partially substituted SizeOfPackExprs is unnecessary because their transformed forms are also profiled within the partial arguments. Fixes https://github.com/llvm/llvm-project/issues/124161 --- clang/include/clang/AST/ExprCXX.h | 2 -- clang/lib/AST/StmtProfile.cpp | 2 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 26 ----------------- .../SemaTemplate/concepts-out-of-line-def.cpp | 28 +++++++++++++++++++ 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 2a130bc6da79a..7b0450b90d564 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4326,8 +4326,6 @@ class SizeOfPackExpr final /// Retrieve the parameter pack. NamedDecl *getPack() const { return Pack; } - void setPack(NamedDecl *NewPack) { Pack = NewPack; } - /// Retrieve the length of the parameter pack. 
/// /// This routine may only be invoked when the expression is not diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 85b59f714ba84..5d1f370cac19f 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2270,13 +2270,13 @@ void StmtProfiler::VisitPackExpansionExpr(const PackExpansionExpr *S) { void StmtProfiler::VisitSizeOfPackExpr(const SizeOfPackExpr *S) { VisitExpr(S); - VisitDecl(S->getPack()); if (S->isPartiallySubstituted()) { auto Args = S->getPartialArguments(); ID.AddInteger(Args.size()); for (const auto &TA : Args) VisitTemplateArgument(TA); } else { + VisitDecl(S->getPack()); ID.AddInteger(0); } } diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 3dc5696bd3821..3c6b7ce2949c1 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1762,23 +1762,6 @@ namespace { return inherited::TransformLambdaBody(E, Body); } - ExprResult TransformSizeOfPackExpr(SizeOfPackExpr *E) { - ExprResult Transformed = inherited::TransformSizeOfPackExpr(E); - if (!Transformed.isUsable()) - return Transformed; - auto *TransformedExpr = cast<SizeOfPackExpr>(Transformed.get()); - if (SemaRef.CodeSynthesisContexts.back().Kind == - Sema::CodeSynthesisContext::ConstraintNormalization && - TransformedExpr->getPack() == E->getPack()) { - Decl *NewPack = - TransformDecl(E->getPackLoc(), TransformedExpr->getPack()); - if (!NewPack) - return ExprError(); - TransformedExpr->setPack(cast<NamedDecl>(NewPack)); - } - return TransformedExpr; - } - ExprResult TransformRequiresExpr(RequiresExpr *E) { LocalInstantiationScope Scope(SemaRef, /*CombineWithOuterScope=*/true); ExprResult TransReq = inherited::TransformRequiresExpr(E); @@ -1902,15 +1885,6 @@ Decl *TemplateInstantiator::TransformDecl(SourceLocation Loc, Decl *D) { TemplateArgument Arg = TemplateArgs(TTP->getDepth(), TTP->getPosition()); if (TTP->isParameterPack()) { - // We might not have an index for pack expansion when normalizing - // constraint expressions. In that case, resort to instantiation scopes - // for the transformed declarations. - if (SemaRef.ArgumentPackSubstitutionIndex == -1 && - SemaRef.CodeSynthesisContexts.back().Kind == - Sema::CodeSynthesisContext::ConstraintNormalization) { - return SemaRef.FindInstantiatedDecl(Loc, cast<NamedDecl>(D), - TemplateArgs); - } assert(Arg.getKind() == TemplateArgument::Pack && "Missing argument pack"); Arg = getPackSubstitutedTemplateArgument(getSema(), Arg); diff --git a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp index 6c1a229a9fdda..5af4ec75cae90 100644 --- a/clang/test/SemaTemplate/concepts-out-of-line-def.cpp +++ b/clang/test/SemaTemplate/concepts-out-of-line-def.cpp @@ -722,6 +722,34 @@ template struct d; } // namespace GH115098 +namespace GH123441 { + +struct buf { + constexpr buf(auto&&... initList) requires (sizeof...(initList) <= 8); +}; + +constexpr buf::buf(auto&&... initList) requires (sizeof...(initList) <= 8) {} + +template <class T> +struct buffer { + constexpr buffer(auto&&... initList) requires (sizeof...(initList) <= 8); +}; + +template <class T> +constexpr buffer<T>::buffer(auto&&... initList) requires (sizeof...(initList) <= 8) {} + +template <class... T> +struct foo { // expected-note {{foo defined here}} + constexpr foo(auto&&... initList) + requires (sizeof...(initList) <= 8); +}; + +template <class... T> +constexpr foo<T...>::foo(auto&&...
initList) // expected-error {{does not match any declaration}} + requires (sizeof...(T) <= 8) {} + +} // namespace GH123441 + namespace GH114685 { template struct ptr { From 092372da15e5165be14cdbb7cac3cf4976fd82d0 Mon Sep 17 00:00:00 2001 From: MaheshRavishankar <1663364+MaheshRavishankar@users.noreply.github.com> Date: Mon, 27 Jan 2025 07:05:34 -0800 Subject: [PATCH 203/432] [mlir][Tensor] Rework `ReifyRankedShapedTypeInterface` implementation for `tensor.expand_shape` op. (#113501) The op carries the output shape directly, so it can be used as-is. Also adds a method to get the shape as a `SmallVector<OpFoldResult>`. Signed-off-by: MaheshRavishankar --- .../mlir/Dialect/Tensor/IR/TensorOps.td | 3 + .../mlir/Dialect/Utils/StaticValueUtils.h | 3 + .../IR/TensorInferTypeOpInterfaceImpl.cpp | 115 +++--------------- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 4 + mlir/lib/Dialect/Utils/StaticValueUtils.cpp | 10 +- mlir/lib/Interfaces/InferTypeOpInterface.cpp | 8 -- .../resolve-shaped-type-result-dims.mlir | 7 +- mlir/test/Dialect/Tensor/fold-empty-op.mlir | 9 +- 8 files changed, 43 insertions(+), 116 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index 8ad1b23cb2bfe..3ef7c74fd3af1 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -1165,6 +1165,9 @@ def Tensor_ExpandShapeOp : Tensor_ReassociativeReshapeOp<"expand_shape"> { let extraClassDeclaration = commonExtraClassDeclaration # [{ int64_t getCorrespondingSourceDim(int64_t resultDim); + // Return output shape as mixed static/dynamic shapes. + SmallVector<OpFoldResult> getMixedOutputShape(); + // Infer the output shape for a tensor.expand_shape when it is possible // to do so. static FailureOr<SmallVector<OpFoldResult>> inferOutputShape( diff --git a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h index d1f7ab1156248..2a3a2defb810d 100644 --- a/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StaticValueUtils.h @@ -144,6 +144,9 @@ bool isEqualConstantIntOrValueArray(ArrayRef<OpFoldResult> ofrs1, /// Return a vector of OpFoldResults with the same size a staticValues, but /// all elements for which ShapedType::isDynamic is true, will be replaced by /// dynamicValues. +SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, + ValueRange dynamicValues, + MLIRContext *context); SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, ValueRange dynamicValues, Builder &b); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp index 7ff435a033985..f6fea08e2e717 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp @@ -16,24 +16,6 @@ using namespace mlir; using namespace mlir::tensor; -/// Compute a map that for a given dimension of the expanded type gives the -/// dimension in the collapsed type it maps to. Essentially its the inverse of -/// the `reassocation` maps.
-static llvm::DenseMap<int64_t, int64_t> -getExpandedDimToCollapsedDimMap(ArrayRef<AffineMap> reassociation) { - llvm::DenseMap<int64_t, int64_t> expandedDimToCollapsedDim; - for (const auto &map : enumerate(reassociation)) { - unsigned startPos = - cast<AffineDimExpr>(map.value().getResults().front()).getPosition(); - unsigned endPos = - cast<AffineDimExpr>(map.value().getResults().back()).getPosition(); - for (auto dim : llvm::seq_inclusive(startPos, endPos)) { - expandedDimToCollapsedDim[dim] = map.index(); - } - } - return expandedDimToCollapsedDim; -} - /// For reshape op compute the shape at dimension `dimIndex` of the output in /// terms of shape of the `src`, when the reshape op is a collapsing /// operation. It is the product of the shape of the collapsed dimensions of the @@ -76,84 +58,15 @@ static SmallVector<OpFoldResult> getCollapsedOutputShapeFromInputShape( })); } -/// For an expanding reshape op, compute the value for a dimension of the output -/// from the shape of the input. -static OpFoldResult getExpandedOutputDimFromInputShape( - OpBuilder &builder, Location loc, int64_t dimIndex, Value src, - ArrayRef<int64_t> dstStaticShape, ArrayRef<AffineMap> reassociation, - llvm::DenseMap<int64_t, int64_t> &expandedDimToCollapsedDim) { - if (!ShapedType::isDynamic(dstStaticShape[dimIndex])) { - // Static dimension: return Attribute. - return builder.getIndexAttr(dstStaticShape[dimIndex]); - } - unsigned sourceDimPos = expandedDimToCollapsedDim[dimIndex]; - unsigned startPos = - cast<AffineDimExpr>(reassociation[sourceDimPos].getResults().front()) - .getPosition(); - unsigned endPos = - cast<AffineDimExpr>(reassociation[sourceDimPos].getResults().back()) - .getPosition(); - int64_t linearizedStaticDim = 1; - for (auto d : - llvm::enumerate(dstStaticShape.slice(startPos, endPos - startPos + 1))) { - if (d.index() + startPos == static_cast<unsigned>(dimIndex)) - continue; - assert(!ShapedType::isDynamic(d.value()) && - "single dimension cannot be expanded into multiple dynamic " - "dimensions"); - linearizedStaticDim *= d.value(); - } - OpFoldResult sourceDim = - builder.create<tensor::DimOp>(loc, src, sourceDimPos).getResult(); - - // Dynamic dimension: return Value. - return affine::makeComposedAffineApply( - builder, loc, - AffineMap::get( - 0, 1, - builder.getAffineSymbolExpr(0).floorDiv(linearizedStaticDim)), - sourceDim) - ->getResult(0); -} - -/// Given the `src` of an expanding reshape op, the reassociation maps and the -/// result type, compute the shape of the result of the reshape. -static SmallVector<OpFoldResult> getExpandedOutputShapeFromInputShape( - OpBuilder &builder, Location loc, Value src, - ArrayRef<int64_t> dstStaticShape, ArrayRef<AffineMap> reassociation) { - llvm::DenseMap<int64_t, int64_t> expandedDimToCollapsedDim = - getExpandedDimToCollapsedDimMap(reassociation); - return llvm::to_vector<4>(llvm::map_range( - llvm::seq<int64_t>(0, dstStaticShape.size()), [&](int64_t dim) { - return getExpandedOutputDimFromInputShape(builder, loc, dim, src, - dstStaticShape, reassociation, - expandedDimToCollapsedDim); - })); -} - -static SmallVector<OpFoldResult> -getReshapeOutputShapeFromInputShape(OpBuilder &builder, Location loc, Value src, - ArrayRef<int64_t> dstStaticShape, - ArrayRef<AffineMap> reassocation) { - return dstStaticShape.size() > - static_cast<size_t>( - llvm::cast<ShapedType>(src.getType()).getRank()) - ?
getExpandedOutputShapeFromInputShape( - builder, loc, src, dstStaticShape, reassocation) : getCollapsedOutputShapeFromInputShape( - builder, loc, src, dstStaticShape, reassocation); -} - -template <typename OpTy> -struct ReifyExpandOrCollapseShapeOp +struct ReifyCollapseShapeOp : public ReifyRankedShapedTypeOpInterface::ExternalModel< - ReifyExpandOrCollapseShapeOp<OpTy>, OpTy> { + ReifyCollapseShapeOp, CollapseShapeOp> { LogicalResult reifyResultShapes(Operation *op, OpBuilder &b, ReifiedRankedShapedTypeDims &reifiedReturnShapes) const { auto loc = op->getLoc(); - auto reshapeOp = cast<OpTy>(op); - reifiedReturnShapes.push_back(getReshapeOutputShapeFromInputShape( + auto reshapeOp = cast<CollapseShapeOp>(op); + reifiedReturnShapes.push_back(getCollapsedOutputShapeFromInputShape( b, loc, reshapeOp.getSrc(), reshapeOp.getResultType().getShape(), reshapeOp.getReassociationMaps())); return success(); @@ -162,6 +75,20 @@ struct ReifyExpandOrCollapseShapeOp namespace { +struct ReifyExpandShapeOp + : public ReifyRankedShapedTypeOpInterface::ExternalModel<ReifyExpandShapeOp, ExpandShapeOp> { + LogicalResult + reifyResultShapes(Operation *op, OpBuilder &b, + ReifiedRankedShapedTypeDims &reifyResultShapes) const { + auto expandShapeOp = cast<ExpandShapeOp>(op); + SmallVector<OpFoldResult> resultShapes = + expandShapeOp.getMixedOutputShape(); + reifyResultShapes.emplace_back(std::move(resultShapes)); + return success(); + } +}; + struct ReifyPadOp : public ReifyRankedShapedTypeOpInterface::ExternalModel<ReifyPadOp, PadOp> { LogicalResult @@ -202,10 +129,8 @@ struct ReifyPadOp void mlir::tensor::registerInferTypeOpInterfaceExternalModels( DialectRegistry &registry) { registry.addExtension(+[](MLIRContext *ctx, TensorDialect *dialect) { - ExpandShapeOp::attachInterface< - ReifyExpandOrCollapseShapeOp<ExpandShapeOp>>(*ctx); - CollapseShapeOp::attachInterface< - ReifyExpandOrCollapseShapeOp<CollapseShapeOp>>(*ctx); + ExpandShapeOp::attachInterface<ReifyExpandShapeOp>(*ctx); + CollapseShapeOp::attachInterface<ReifyCollapseShapeOp>(*ctx); PadOp::attachInterface<ReifyPadOp>(*ctx); }); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 24a1d55315319..117908129561f 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -1732,6 +1732,10 @@ ExpandShapeOp::inferOutputShape(OpBuilder &b, Location loc, return *outputShape; } +SmallVector<OpFoldResult> ExpandShapeOp::getMixedOutputShape() { + return getMixedValues(getStaticOutputShape(), getOutputShape(), getContext()); +} + void ExpandShapeOp::build(OpBuilder &builder, OperationState &result, Type resultType, Value src, ArrayRef<ReassociationIndices> reassociation, diff --git a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp index 5c8f6ded39ba4..fcb736aa031f3 100644 --- a/mlir/lib/Dialect/Utils/StaticValueUtils.cpp +++ b/mlir/lib/Dialect/Utils/StaticValueUtils.cpp @@ -191,7 +191,8 @@ bool isEqualConstantIntOrValueArray(ArrayRef<OpFoldResult> ofrs1, /// elements for which ShapedType::isDynamic is true, will be replaced by /// dynamicValues. SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, - ValueRange dynamicValues, Builder &b) { + ValueRange dynamicValues, + MLIRContext *context) { SmallVector<OpFoldResult> res; res.reserve(staticValues.size()); unsigned numDynamic = 0; @@ -200,10 +201,15 @@ SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, int64_t value = staticValues[idx]; res.push_back(ShapedType::isDynamic(value) ?
OpFoldResult{dynamicValues[numDynamic++]} - : OpFoldResult{b.getI64IntegerAttr(staticValues[idx])}); + : OpFoldResult{IntegerAttr::get( + IntegerType::get(context, 64), staticValues[idx])}); } return res; } +SmallVector<OpFoldResult> getMixedValues(ArrayRef<int64_t> staticValues, + ValueRange dynamicValues, Builder &b) { + return getMixedValues(staticValues, dynamicValues, b.getContext()); +} /// Decompose a vector of mixed static or dynamic values into the corresponding /// pair of arrays. This is the inverse function of `getMixedValues`. diff --git a/mlir/lib/Interfaces/InferTypeOpInterface.cpp b/mlir/lib/Interfaces/InferTypeOpInterface.cpp index 3eb401c449980..6b5e103cd36c2 100644 --- a/mlir/lib/Interfaces/InferTypeOpInterface.cpp +++ b/mlir/lib/Interfaces/InferTypeOpInterface.cpp @@ -48,14 +48,6 @@ mlir::reifyResultShapes(OpBuilder &b, Operation *op, assert(shapedType.getRank() == static_cast<int64_t>(reifiedReturnShapes[resultIdx].size()) && "incorrect implementation of ReifyRankedShapedTypeOpInterface"); - for (int64_t dim = 0; dim < shapedType.getRank(); ++dim) { - // reifyResultShapes must return: - // * Attribute for static dimensions - // * Value for dynamic dimensions - assert(shapedType.isDynamicDim(dim) == - isa<Value>(reifiedReturnShapes[resultIdx][dim]) && - "incorrect implementation of ReifyRankedShapedTypeOpInterface"); - } ++resultIdx; } // Assert that every shaped value result was reified. diff --git a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir index 8fb84248c9613..3bc1f56d816d7 100644 --- a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir +++ b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir @@ -210,15 +210,12 @@ func.func @dim_reshape_expansion(%arg0 : tensor<6x5x?xf32>, %sz0: index) -> (ind %3 = tensor.dim %0, %c4 : tensor<2x3x5x4x?x7xf32> return %1, %2, %3 : index, index, index } -// CHECK: #[[MAP:.+]] = affine_map<()[s0] -> (s0 floordiv 28)> // CHECK: func @dim_reshape_expansion // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<6x5x?xf32> -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-SAME: %[[ARG1:.+]]: index // CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index -// CHECK: %[[D0:.+]] = tensor.dim %[[ARG0]], %[[C2]] -// CHECK: %[[D1:.+]] = affine.apply #[[MAP]]()[%[[D0]]] -// CHECK: return %[[C3]], %[[C4]], %[[D1]] +// CHECK: return %[[C3]], %[[C4]], %[[ARG1]] // ----- diff --git a/mlir/test/Dialect/Tensor/fold-empty-op.mlir b/mlir/test/Dialect/Tensor/fold-empty-op.mlir index 65ceb4ff3e3df..850bbcee34020 100644 --- a/mlir/test/Dialect/Tensor/fold-empty-op.mlir +++ b/mlir/test/Dialect/Tensor/fold-empty-op.mlir @@ -10,7 +10,6 @@ module attributes {transform.with_named_sequence} { } } -// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 floordiv 28)> // CHECK: #[[$MAP2:.+]] = affine_map<()[s0] -> (s0 * 28)> func.func @empty_reshape_expansion(%arg0 : index, %sz0: index) -> tensor<2x3x5x4x?x7xf32> { @@ -19,11 +18,9 @@ func.func @empty_reshape_expansion(%arg0 : index, %sz0: index) -> tensor<2x3x5x4 return %1 : tensor<2x3x5x4x?x7xf32> } // CHECK-LABEL: func @empty_reshape_expansion -// CHECK-SAME: %[[ARG0:.+]]: index -// CHECK: %[[OLD_INIT:.+]] = tensor.empty(%{{.*}}) : tensor<6x5x?xf32> -// CHECK-NEXT: %[[DIM:.*]] = tensor.dim %[[OLD_INIT]] -// CHECK-NEXT: %[[D:.+]] = affine.apply #[[$MAP]]()[%[[DIM]]] -// CHECK-NEXT: %[[INIT:.+]] = tensor.empty(%[[D]]) +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]:
index +// CHECK-NEXT: %[[INIT:.+]] = tensor.empty(%[[ARG1]]) // CHECK-NEXT: return %[[INIT]] func.func @empty_reshape_collapse(%arg0 : index) -> tensor<6x5x?xf32> { From 62340ff8d844fc02cd1bd34ff6235f1f0e1e464f Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 27 Jan 2025 10:12:20 -0500 Subject: [PATCH 204/432] [AMDGPU][True16][MC] true16 for v_cmpx_xx_f16 (#123419) A bulk commit of true16 support for v_cmpx_xx_f16 instructions including: v_cmpx_f_f16 v_cmpx_le_f16 v_cmpx_gt_f16 v_cmpx_lg_f16 v_cmpx_ge_f16 v_cmpx_o_f16 v_cmpx_u_f16 v_cmpx_nge_f16 v_cmpx_nlg_f16 v_cmpx_ngt_f16 v_cmpx_nle_f16 v_cmpx_neq_f16 v_cmpx_nlt_f16 v_cmpx_t_f16 v_cmpx_eq_f16 is not in this patch and will be added in the following patch --- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 28 +- .../AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s | 966 +++++++------ .../AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s | 366 +++-- .../MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s | 204 ++- llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s | 1113 +++++++++------ llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s | 963 +++++++------ llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s | 307 ++-- llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s | 582 +++++--- .../MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s | 630 ++++++--- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s | 168 ++- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s | 876 +++++++----- llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s | 396 ++++-- llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s | 864 +++++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s | 744 +++++----- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s | 216 ++- llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s | 504 +++++-- .../MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s | 504 +++++-- .../gfx11_dasm_vop3_dpp16_from_vopcx.txt | 756 +++++++--- .../gfx11_dasm_vop3_dpp8_from_vopcx.txt | 336 ++++- .../AMDGPU/gfx11_dasm_vop3_from_vopcx.txt | 196 ++- .../Disassembler/AMDGPU/gfx11_dasm_vopcx.txt | 910 +++++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp16.txt | 756 +++++++--- .../AMDGPU/gfx11_dasm_vopcx_dpp8.txt | 336 ++++- .../Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt | 179 ++- .../AMDGPU/gfx12_dasm_vop3cx_dpp16.txt | 1248 ++++++++++++++--- .../AMDGPU/gfx12_dasm_vop3cx_dpp8.txt | 341 ++++- .../Disassembler/AMDGPU/gfx12_dasm_vopcx.txt | 732 +++++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp16.txt | 600 +++++--- .../AMDGPU/gfx12_dasm_vopcx_dpp8.txt | 168 ++- 29 files changed, 11102 insertions(+), 4887 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index 46cad585b8a82..e16ac4423265e 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -1975,22 +1975,22 @@ defm V_CMP_CLASS_F16 : VOPC_Real_t16_and_fake16_gfx11_gfx12<0x07d, "v_cmp_cl defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>; defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>; -defm V_CMPX_F_F16_fake16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">; +defm V_CMPX_F_F16 : VOPCX_Real_t16_and_fake16_gfx11<0x080, "v_cmpx_f_f16">; defm V_CMPX_LT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">; defm V_CMPX_EQ_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">; -defm V_CMPX_LE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; -defm V_CMPX_GT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; -defm V_CMPX_LG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; -defm V_CMPX_GE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; -defm V_CMPX_O_F16_fake16 : 
VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; -defm V_CMPX_U_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; -defm V_CMPX_NGE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; -defm V_CMPX_NLG_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; -defm V_CMPX_NGT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; -defm V_CMPX_NLE_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; -defm V_CMPX_NEQ_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; -defm V_CMPX_NLT_F16_fake16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; -defm V_CMPX_T_F16_fake16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_fake16", "v_cmpx_t_f16", "v_cmpx_tru_f16">; +defm V_CMPX_LE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x083, "v_cmpx_le_f16">; +defm V_CMPX_GT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">; +defm V_CMPX_LG_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">; +defm V_CMPX_GE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">; +defm V_CMPX_O_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x087, "v_cmpx_o_f16">; +defm V_CMPX_U_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x088, "v_cmpx_u_f16">; +defm V_CMPX_NGE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">; +defm V_CMPX_NLG_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">; +defm V_CMPX_NGT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">; +defm V_CMPX_NLE_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">; +defm V_CMPX_NEQ_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">; +defm V_CMPX_NLT_F16 : VOPCX_Real_t16_and_fake16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">; +defm V_CMPX_T_F16 : VOPCX_Real_t16_and_fake16_gfx11<0x08f, "v_cmpx_t_f16", "V_CMPX_TRU_F16", "v_cmpx_tru_f16">; defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>; defm V_CMPX_LT_F32 : VOPCX_Real_gfx11_gfx12<0x091>; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s index e946097f35e23..6864ce20499cb 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16_from_vopcx.s @@ -375,47 +375,56 @@ v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_f_f16_e64_dpp 
v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_f_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 
row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -543,47 +552,56 @@ v_cmpx_f_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + 
+v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -813,47 +831,56 @@ v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1083,47 +1110,56 @@ v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 +// 
GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1353,47 +1389,56 @@ v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: 
v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1902,47 +1947,56 @@ v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + 
+v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1986,47 +2040,56 @@ v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, 
v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: 
v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2070,47 +2133,56 @@ v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+
+v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2154,47 +2226,56 @@ v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2238,47 +2319,56 @@ v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+
+v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2322,47 +2412,56 @@ v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2406,47 +2505,56 @@ v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2490,47 +2598,47 @@ v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl
 v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2658,47 +2766,56 @@ v_cmpx_t_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]

-v_cmpx_tru_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_tru_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_tru_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_tru_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_tru_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_tru_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_tru_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_tru_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+
+v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_tru_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2742,47 +2859,56 @@ v_cmpx_tru_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct
 v_cmpx_tru_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_mirror
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

-v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

-v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

-v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]
+v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13]

-v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]
+v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30]

 v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s
index e60406078f745..0e36812c78dc1 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8_from_vopcx.s
@@ -116,17 +116,26 @@ v_cmpx_eq_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x80,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x80,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x80,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x80,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x80,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -158,17 +167,26 @@ v_cmpx_f_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xc8,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

-v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -236,17 +254,26 @@ v_cmpx_ge_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+
+v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -314,17 +341,26 @@ v_cmpx_gt_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

-v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

 v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
 // GFX11: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -392,17 +428,26 @@ v_cmpx_le_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
 v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
 // GFX11: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]

-v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding:
[0x7e,0x93,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -557,17 +602,26 @@ v_cmpx_ne_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -581,17 +635,26 @@ v_cmpx_neq_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -605,17 +668,26 @@ v_cmpx_nge_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x99,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -629,17 +701,26 @@ v_cmpx_ngt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -653,17 +734,26 @@ v_cmpx_nle_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -677,17 +767,26 @@ v_cmpx_nlg_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -701,17 +800,26 @@ v_cmpx_nlt_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlt_f32_e64_dpp -|v255|, 
-|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -725,17 +833,17 @@ v_cmpx_o_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x97,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -767,17 +875,26 @@ v_cmpx_t_u32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcf,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_tru_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_tru_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_tru_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 
+// GFX11: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8f,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_tru_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -791,17 +908,26 @@ v_cmpx_tru_f32_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_tru_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: v_cmpx_t_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9f,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s index 
799a8f86a01e9..a4340919ca6d2 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_from_vopcx.s @@ -545,11 +545,11 @@ v_cmpx_eq_u64_e64 src_scc, exec v_cmpx_eq_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_f_f16_e64 v1, v2 -// GFX11: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_f_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_f_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_f_f16_e64 v255, v255 -// GFX11: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_f_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_f_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] v_cmpx_f_f16_e64 s1, s2 // GFX11: v_cmpx_f_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00] @@ -590,6 +590,12 @@ v_cmpx_f_f16_e64 -src_scc, |vcc_lo| v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_f_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_f_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_f_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_f_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_f_f32_e64 v1, v2 // GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] @@ -833,11 +839,11 @@ v_cmpx_f_u64_e64 src_scc, exec v_cmpx_f_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_f_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd8,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16_e64 v1, v2 -// GFX11: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_f16_e64 v255, v255 -// GFX11: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_f16_e64 s1, s2 // GFX11: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] @@ -878,6 +884,12 @@ v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_ge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_ge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_f32_e64 v1, v2 // GFX11: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -1223,11 +1235,11 @@ v_cmpx_ge_u64_e64 src_scc, exec v_cmpx_ge_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16_e64 v1, v2 -// GFX11: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_f16_e64 v255, v255 -// GFX11: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] 
+v_cmpx_gt_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_f16_e64 s1, s2 // GFX11: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] @@ -1268,6 +1280,12 @@ v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_gt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_gt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_f32_e64 v1, v2 // GFX11: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1613,11 +1631,11 @@ v_cmpx_gt_u64_e64 src_scc, exec v_cmpx_gt_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16_e64 v1, v2 -// GFX11: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_f16_e64 v255, v255 -// GFX11: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_f16_e64 s1, s2 // GFX11: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] @@ -1658,6 +1676,12 @@ v_cmpx_le_f16_e64 -src_scc, |vcc_lo| v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_le_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_le_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_f32_e64 v1, v2 // GFX11: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -2003,11 +2027,11 @@ v_cmpx_le_u64_e64 src_scc, exec v_cmpx_le_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16_e64 v1, v2 -// GFX11: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lg_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lg_f16_e64 v255, v255 -// GFX11: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lg_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lg_f16_e64 s1, s2 // GFX11: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] @@ -2048,6 +2072,12 @@ v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_lg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lg_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_lg_f16_e64 v255.l, v255.h ; encoding: 
[0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lg_f32_e64 v1, v2 // GFX11: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -2789,11 +2819,11 @@ v_cmpx_ne_u64_e64 src_scc, exec v_cmpx_ne_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16_e64 v1, v2 -// GFX11: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_neq_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_neq_f16_e64 v255, v255 -// GFX11: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_neq_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] v_cmpx_neq_f16_e64 s1, s2 // GFX11: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] @@ -2834,6 +2864,12 @@ v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_neq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_neq_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_neq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_neq_f32_e64 v1, v2 // GFX11: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2915,11 +2951,11 @@ v_cmpx_neq_f64_e64 -|src_scc|, -|exec| v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16_e64 v1, v2 -// GFX11: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nge_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nge_f16_e64 v255, v255 -// GFX11: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nge_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nge_f16_e64 s1, s2 // GFX11: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] @@ -2960,6 +2996,12 @@ v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nge_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nge_f32_e64 v1, v2 // GFX11: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -3041,11 +3083,11 @@ v_cmpx_nge_f64_e64 -|src_scc|, -|exec| v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16_e64 v1, v2 -// GFX11: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ngt_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] 
-v_cmpx_ngt_f16_e64 v255, v255 -// GFX11: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ngt_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ngt_f16_e64 s1, s2 // GFX11: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] @@ -3086,6 +3128,12 @@ v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_ngt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ngt_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_ngt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ngt_f32_e64 v1, v2 // GFX11: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -3167,11 +3215,11 @@ v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16_e64 v1, v2 -// GFX11: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nle_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nle_f16_e64 v255, v255 -// GFX11: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nle_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nle_f16_e64 s1, s2 // GFX11: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] @@ -3212,6 +3260,12 @@ v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nle_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nle_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nle_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nle_f32_e64 v1, v2 // GFX11: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3293,11 +3347,11 @@ v_cmpx_nle_f64_e64 -|src_scc|, -|exec| v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16_e64 v1, v2 -// GFX11: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlg_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlg_f16_e64 v255, v255 -// GFX11: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlg_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlg_f16_e64 s1, s2 // GFX11: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] @@ -3338,6 +3392,12 @@ v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: 
[0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nlg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlg_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nlg_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlg_f32_e64 v1, v2 // GFX11: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3419,11 +3479,11 @@ v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16_e64 v1, v2 -// GFX11: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlt_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlt_f16_e64 v255, v255 -// GFX11: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlt_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlt_f16_e64 s1, s2 // GFX11: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] @@ -3464,6 +3524,12 @@ v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_nlt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlt_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_nlt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlt_f32_e64 v1, v2 // GFX11: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3545,11 +3611,11 @@ v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16_e64 v1, v2 -// GFX11: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_o_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_o_f16_e64 v255, v255 -// GFX11: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_o_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] v_cmpx_o_f16_e64 s1, s2 // GFX11: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] @@ -3590,6 +3656,12 @@ v_cmpx_o_f16_e64 -src_scc, |vcc_lo| v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_o_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_o_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_o_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_o_f32_e64 v1, v2 // GFX11: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -3671,11 +3743,11 @@ v_cmpx_o_f64_e64 -|src_scc|, -|exec| v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: 
[0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_t_f16_e64 v1, v2 -// GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_t_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_t_f16_e64 v255, v255 -// GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_t_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_t_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] v_cmpx_t_f16_e64 s1, s2 // GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00] @@ -3959,11 +4031,11 @@ v_cmpx_t_u64_e64 src_scc, exec v_cmpx_t_u64_e64 0xaf123456, vcc // GFX11: v_cmpx_t_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdf,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_tru_f16_e64 v1, v2 -// GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_tru_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_tru_f16_e64 v255, v255 -// GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_tru_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_t_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] v_cmpx_tru_f16_e64 s1, s2 // GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00] @@ -4004,6 +4076,12 @@ v_cmpx_tru_f16_e64 -src_scc, |vcc_lo| v_cmpx_tru_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_t_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_t_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_t_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_t_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_tru_f32_e64 v1, v2 // GFX11: v_cmpx_t_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00] @@ -4085,11 +4163,11 @@ v_cmpx_tru_f64_e64 -|src_scc|, -|exec| v_cmpx_tru_f64_e64 0xaf123456, -|vcc| clamp // GFX11: v_cmpx_t_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaf,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16_e64 v1, v2 -// GFX11: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_u_f16_e64 v1.l, v2.l +// GFX11: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_u_f16_e64 v255, v255 -// GFX11: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_u_f16_e64 v255.l, v255.l +// GFX11: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] v_cmpx_u_f16_e64 s1, s2 // GFX11: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] @@ -4130,6 +4208,12 @@ v_cmpx_u_f16_e64 -src_scc, |vcc_lo| v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_u_f16_e64 v1.h, v2.l +// GFX11: v_cmpx_u_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_u_f16_e64 v255.l, v255.h +// GFX11: v_cmpx_u_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_u_f32_e64 v1, v2 // GFX11: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git 
a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s index 88d9fb6cc1357..98aba2b960ad9 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx.s @@ -566,50 +566,65 @@ v_cmpx_eq_u64 src_scc, v[2:3] v_cmpx_eq_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_f_f16 v1, v2 -// GFX11: v_cmpx_f_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x00,0x7d] +v_cmpx_f_f16 v1.l, v2.l +// GFX11: v_cmpx_f_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x00,0x7d] -v_cmpx_f_f16 v127, v2 -// GFX11: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d] +v_cmpx_f_f16 v127.l, v2.l +// GFX11: v_cmpx_f_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x00,0x7d] -v_cmpx_f_f16 s1, v2 -// GFX11: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d] +v_cmpx_f_f16 s1, v2.l +// GFX11: v_cmpx_f_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x00,0x7d] -v_cmpx_f_f16 s105, v2 -// GFX11: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d] +v_cmpx_f_f16 s105, v2.l +// GFX11: v_cmpx_f_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x00,0x7d] -v_cmpx_f_f16 vcc_lo, v2 -// GFX11: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d] +v_cmpx_f_f16 vcc_lo, v2.l +// GFX11: v_cmpx_f_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x00,0x7d] -v_cmpx_f_f16 vcc_hi, v2 -// GFX11: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d] +v_cmpx_f_f16 vcc_hi, v2.l +// GFX11: v_cmpx_f_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x00,0x7d] -v_cmpx_f_f16 ttmp15, v2 -// GFX11: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d] +v_cmpx_f_f16 ttmp15, v2.l +// GFX11: v_cmpx_f_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x00,0x7d] -v_cmpx_f_f16 m0, v2 -// GFX11: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d] +v_cmpx_f_f16 m0, v2.l +// GFX11: v_cmpx_f_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x00,0x7d] -v_cmpx_f_f16 exec_lo, v2 -// GFX11: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d] +v_cmpx_f_f16 exec_lo, v2.l +// GFX11: v_cmpx_f_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x00,0x7d] -v_cmpx_f_f16 exec_hi, v2 -// GFX11: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d] +v_cmpx_f_f16 exec_hi, v2.l +// GFX11: v_cmpx_f_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x00,0x7d] -v_cmpx_f_f16 null, v2 -// GFX11: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d] +v_cmpx_f_f16 null, v2.l +// GFX11: v_cmpx_f_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x00,0x7d] -v_cmpx_f_f16 -1, v2 -// GFX11: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d] +v_cmpx_f_f16 -1, v2.l +// GFX11: v_cmpx_f_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x00,0x7d] -v_cmpx_f_f16 0.5, v2 -// GFX11: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d] +v_cmpx_f_f16 0.5, v2.l +// GFX11: v_cmpx_f_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x00,0x7d] -v_cmpx_f_f16 src_scc, v2 -// GFX11: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d] +v_cmpx_f_f16 src_scc, v2.l +// GFX11: v_cmpx_f_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x00,0x7d] -v_cmpx_f_f16 0xfe0b, v127 -// GFX11: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_f_f16 0xfe0b, v127.l +// GFX11: v_cmpx_f_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_f_f16 v1.h, v2.l +// GFX11: v_cmpx_f_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x00,0x7d] + +v_cmpx_f_f16 v127.h, v2.l +// GFX11: v_cmpx_f_f16_e32 v127.h, v2.l ; encoding: 
[0xff,0x05,0x00,0x7d] + +v_cmpx_f_f16 0.5, v127.l +// GFX11: v_cmpx_f_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x00,0x7d] + +v_cmpx_f_f16 src_scc, v2.h +// GFX11: v_cmpx_f_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x01,0x7d] + +v_cmpx_f_f16 0xfe0b, v127.h +// GFX11: v_cmpx_f_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_f_f32 v1, v2 // GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d] @@ -854,50 +869,65 @@ v_cmpx_f_u64 src_scc, v[2:3] v_cmpx_f_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_f_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16 v1, v2 -// GFX11: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +v_cmpx_ge_f16 v1.l, v2.l +// GFX11: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 v127.l, v2.l +// GFX11: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 s1, v2.l +// GFX11: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] + +v_cmpx_ge_f16 s105, v2.l +// GFX11: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] + +v_cmpx_ge_f16 vcc_lo, v2.l +// GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] + +v_cmpx_ge_f16 vcc_hi, v2.l +// GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 v127, v2 -// GFX11: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +v_cmpx_ge_f16 ttmp15, v2.l +// GFX11: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 s1, v2 -// GFX11: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +v_cmpx_ge_f16 m0, v2.l +// GFX11: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] -v_cmpx_ge_f16 s105, v2 -// GFX11: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_lo, v2.l +// GFX11: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_lo, v2 -// GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_hi, v2.l +// GFX11: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_hi, v2 -// GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 null, v2.l +// GFX11: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] -v_cmpx_ge_f16 ttmp15, v2 -// GFX11: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 -1, v2.l +// GFX11: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] -v_cmpx_ge_f16 m0, v2 -// GFX11: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0.5, v2.l +// GFX11: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_lo, v2 -// GFX11: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] +v_cmpx_ge_f16 src_scc, v2.l +// GFX11: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_hi, v2 -// GFX11: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0xfe0b, v127.l +// GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ge_f16 null, v2 -// GFX11: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +v_cmpx_ge_f16 v1.h, v2.l +// GFX11: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] -v_cmpx_ge_f16 -1, v2 -// GFX11: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +v_cmpx_ge_f16 v127.h, v2.l +// GFX11: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: 
[0xff,0x05,0x0c,0x7d] -v_cmpx_ge_f16 0.5, v2 -// GFX11: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0.5, v127.l +// GFX11: v_cmpx_ge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0c,0x7d] -v_cmpx_ge_f16 src_scc, v2 -// GFX11: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +v_cmpx_ge_f16 src_scc, v2.h +// GFX11: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] -v_cmpx_ge_f16 0xfe0b, v127 -// GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16 0xfe0b, v127.h +// GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_f32 v1, v2 // GFX11: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -1262,50 +1292,65 @@ v_cmpx_ge_u64 src_scc, v[2:3] v_cmpx_ge_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16 v1, v2 -// GFX11: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +v_cmpx_gt_f16 v1.l, v2.l +// GFX11: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] -v_cmpx_gt_f16 v127, v2 -// GFX11: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] +v_cmpx_gt_f16 v127.l, v2.l +// GFX11: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] -v_cmpx_gt_f16 s1, v2 -// GFX11: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +v_cmpx_gt_f16 s1, v2.l +// GFX11: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] -v_cmpx_gt_f16 s105, v2 -// GFX11: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +v_cmpx_gt_f16 s105, v2.l +// GFX11: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_lo, v2 -// GFX11: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_lo, v2.l +// GFX11: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_hi, v2 -// GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_hi, v2.l +// GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] -v_cmpx_gt_f16 ttmp15, v2 -// GFX11: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +v_cmpx_gt_f16 ttmp15, v2.l +// GFX11: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] -v_cmpx_gt_f16 m0, v2 -// GFX11: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +v_cmpx_gt_f16 m0, v2.l +// GFX11: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_lo, v2 -// GFX11: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_lo, v2.l +// GFX11: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_hi, v2 -// GFX11: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_hi, v2.l +// GFX11: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] -v_cmpx_gt_f16 null, v2 -// GFX11: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +v_cmpx_gt_f16 null, v2.l +// GFX11: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] -v_cmpx_gt_f16 -1, v2 -// GFX11: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +v_cmpx_gt_f16 -1, v2.l +// GFX11: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] -v_cmpx_gt_f16 0.5, v2 -// GFX11: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +v_cmpx_gt_f16 0.5, v2.l +// GFX11: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] -v_cmpx_gt_f16 
src_scc, v2 -// GFX11: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +v_cmpx_gt_f16 src_scc, v2.l +// GFX11: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] -v_cmpx_gt_f16 0xfe0b, v127 -// GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16 0xfe0b, v127.l +// GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_gt_f16 v1.h, v2.l +// GFX11: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] + +v_cmpx_gt_f16 v127.h, v2.l +// GFX11: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] + +v_cmpx_gt_f16 0.5, v127.l +// GFX11: v_cmpx_gt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x08,0x7d] + +v_cmpx_gt_f16 src_scc, v2.h +// GFX11: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] + +v_cmpx_gt_f16 0xfe0b, v127.h +// GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_f32 v1, v2 // GFX11: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1670,50 +1715,65 @@ v_cmpx_gt_u64 src_scc, v[2:3] v_cmpx_gt_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16 v1, v2 -// GFX11: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +v_cmpx_le_f16 v1.l, v2.l +// GFX11: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] + +v_cmpx_le_f16 v127.l, v2.l +// GFX11: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] + +v_cmpx_le_f16 s1, v2.l +// GFX11: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] + +v_cmpx_le_f16 s105, v2.l +// GFX11: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] + +v_cmpx_le_f16 vcc_lo, v2.l +// GFX11: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] + +v_cmpx_le_f16 vcc_hi, v2.l +// GFX11: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] -v_cmpx_le_f16 v127, v2 -// GFX11: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +v_cmpx_le_f16 ttmp15, v2.l +// GFX11: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] -v_cmpx_le_f16 s1, v2 -// GFX11: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +v_cmpx_le_f16 m0, v2.l +// GFX11: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] -v_cmpx_le_f16 s105, v2 -// GFX11: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_lo, v2.l +// GFX11: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_lo, v2 -// GFX11: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_hi, v2.l +// GFX11: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_hi, v2 -// GFX11: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +v_cmpx_le_f16 null, v2.l +// GFX11: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] -v_cmpx_le_f16 ttmp15, v2 -// GFX11: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +v_cmpx_le_f16 -1, v2.l +// GFX11: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] -v_cmpx_le_f16 m0, v2 -// GFX11: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +v_cmpx_le_f16 0.5, v2.l +// GFX11: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_lo, v2 -// GFX11: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.l +// GFX11: v_cmpx_le_f16_e32 src_scc, v2.l ; 
encoding: [0xfd,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_hi, v2 -// GFX11: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +v_cmpx_le_f16 0xfe0b, v127.l +// GFX11: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_le_f16 null, v2 -// GFX11: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +v_cmpx_le_f16 v1.h, v2.l +// GFX11: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] -v_cmpx_le_f16 -1, v2 -// GFX11: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +v_cmpx_le_f16 v127.h, v2.l +// GFX11: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] -v_cmpx_le_f16 0.5, v2 -// GFX11: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +v_cmpx_le_f16 0.5, v127.l +// GFX11: v_cmpx_le_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x06,0x7d] -v_cmpx_le_f16 src_scc, v2 -// GFX11: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.h +// GFX11: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] -v_cmpx_le_f16 0xfe0b, v127 -// GFX11: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16 0xfe0b, v127.h +// GFX11: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_f32 v1, v2 // GFX11: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -2078,50 +2138,65 @@ v_cmpx_le_u64 src_scc, v[2:3] v_cmpx_le_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16 v1, v2 -// GFX11: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v1.l, v2.l +// GFX11: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] -v_cmpx_lg_f16 v127, v2 -// GFX11: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v127.l, v2.l +// GFX11: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] -v_cmpx_lg_f16 s1, v2 -// GFX11: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s1, v2.l +// GFX11: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] -v_cmpx_lg_f16 s105, v2 -// GFX11: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s105, v2.l +// GFX11: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_lo, v2 -// GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_lo, v2.l +// GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_hi, v2 -// GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_hi, v2.l +// GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 ttmp15, v2 -// GFX11: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 ttmp15, v2.l +// GFX11: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 m0, v2 -// GFX11: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +v_cmpx_lg_f16 m0, v2.l +// GFX11: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_lo, v2 -// GFX11: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_lo, v2.l +// GFX11: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_hi, v2 -// GFX11: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_hi, v2.l +// GFX11: v_cmpx_lg_f16_e32 exec_hi, v2.l 
; encoding: [0x7f,0x04,0x0a,0x7d] -v_cmpx_lg_f16 null, v2 -// GFX11: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +v_cmpx_lg_f16 null, v2.l +// GFX11: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] -v_cmpx_lg_f16 -1, v2 -// GFX11: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +v_cmpx_lg_f16 -1, v2.l +// GFX11: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0.5, v2 -// GFX11: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +v_cmpx_lg_f16 0.5, v2.l +// GFX11: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] -v_cmpx_lg_f16 src_scc, v2 -// GFX11: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +v_cmpx_lg_f16 src_scc, v2.l +// GFX11: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0xfe0b, v127 -// GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16 0xfe0b, v127.l +// GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lg_f16 v1.h, v2.l +// GFX11: v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 v127.h, v2.l +// GFX11: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 0.5, v127.l +// GFX11: v_cmpx_lg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0a,0x7d] + +v_cmpx_lg_f16 src_scc, v2.h +// GFX11: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] + +v_cmpx_lg_f16 0xfe0b, v127.h +// GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lg_f32 v1, v2 // GFX11: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -2909,50 +2984,65 @@ v_cmpx_ne_u64 src_scc, v[2:3] v_cmpx_ne_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16 v1, v2 -// GFX11: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +v_cmpx_neq_f16 v1.l, v2.l +// GFX11: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 v127.l, v2.l +// GFX11: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 s1, v2.l +// GFX11: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1a,0x7d] + +v_cmpx_neq_f16 s105, v2.l +// GFX11: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] + +v_cmpx_neq_f16 vcc_lo, v2.l +// GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] + +v_cmpx_neq_f16 vcc_hi, v2.l +// GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 v127, v2 -// GFX11: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +v_cmpx_neq_f16 ttmp15, v2.l +// GFX11: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s1, v2 -// GFX11: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +v_cmpx_neq_f16 m0, v2.l +// GFX11: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s105, v2 -// GFX11: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_lo, v2.l +// GFX11: v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_lo, v2 -// GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_hi, v2.l +// GFX11: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_hi, v2 -// GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] 
+v_cmpx_neq_f16 null, v2.l +// GFX11: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] -v_cmpx_neq_f16 ttmp15, v2 -// GFX11: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +v_cmpx_neq_f16 -1, v2.l +// GFX11: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] -v_cmpx_neq_f16 m0, v2 -// GFX11: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0.5, v2.l +// GFX11: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_lo, v2 -// GFX11: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +v_cmpx_neq_f16 src_scc, v2.l +// GFX11: v_cmpx_neq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_hi, v2 -// GFX11: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0xfe0b, v127.l +// GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_neq_f16 null, v2 -// GFX11: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +v_cmpx_neq_f16 v1.h, v2.l +// GFX11: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] -v_cmpx_neq_f16 -1, v2 -// GFX11: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +v_cmpx_neq_f16 v127.h, v2.l +// GFX11: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] -v_cmpx_neq_f16 0.5, v2 -// GFX11: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0.5, v127.l +// GFX11: v_cmpx_neq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1a,0x7d] -v_cmpx_neq_f16 src_scc, v2 -// GFX11: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +v_cmpx_neq_f16 src_scc, v2.h +// GFX11: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] -v_cmpx_neq_f16 0xfe0b, v127 -// GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16 0xfe0b, v127.h +// GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_neq_f32 v1, v2 // GFX11: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -3035,50 +3125,65 @@ v_cmpx_neq_f64 src_scc, v[2:3] v_cmpx_neq_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16 v1, v2 -// GFX11: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] +v_cmpx_nge_f16 v1.l, v2.l +// GFX11: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] -v_cmpx_nge_f16 v127, v2 -// GFX11: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] +v_cmpx_nge_f16 v127.l, v2.l +// GFX11: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] -v_cmpx_nge_f16 s1, v2 -// GFX11: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +v_cmpx_nge_f16 s1, v2.l +// GFX11: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] -v_cmpx_nge_f16 s105, v2 -// GFX11: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +v_cmpx_nge_f16 s105, v2.l +// GFX11: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_lo, v2 -// GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +v_cmpx_nge_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_hi, v2 -// GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +v_cmpx_nge_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] -v_cmpx_nge_f16 ttmp15, v2 -// GFX11: 
v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +v_cmpx_nge_f16 ttmp15, v2.l +// GFX11: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] -v_cmpx_nge_f16 m0, v2 -// GFX11: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +v_cmpx_nge_f16 m0, v2.l +// GFX11: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_lo, v2 -// GFX11: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_lo, v2.l +// GFX11: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_hi, v2 -// GFX11: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_hi, v2.l +// GFX11: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] -v_cmpx_nge_f16 null, v2 -// GFX11: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +v_cmpx_nge_f16 null, v2.l +// GFX11: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] -v_cmpx_nge_f16 -1, v2 -// GFX11: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +v_cmpx_nge_f16 -1, v2.l +// GFX11: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] -v_cmpx_nge_f16 0.5, v2 -// GFX11: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +v_cmpx_nge_f16 0.5, v2.l +// GFX11: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] -v_cmpx_nge_f16 src_scc, v2 -// GFX11: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +v_cmpx_nge_f16 src_scc, v2.l +// GFX11: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] -v_cmpx_nge_f16 0xfe0b, v127 -// GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_nge_f16 v1.h, v2.l +// GFX11: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] + +v_cmpx_nge_f16 v127.h, v2.l +// GFX11: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] + +v_cmpx_nge_f16 0.5, v127.l +// GFX11: v_cmpx_nge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x12,0x7d] + +v_cmpx_nge_f16 src_scc, v2.h +// GFX11: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] + +v_cmpx_nge_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nge_f32 v1, v2 // GFX11: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -3161,50 +3266,65 @@ v_cmpx_nge_f64 src_scc, v[2:3] v_cmpx_nge_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16 v1, v2 -// GFX11: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +v_cmpx_ngt_f16 v1.l, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 v127.l, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 s1, v2.l +// GFX11: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] + +v_cmpx_ngt_f16 s105, v2.l +// GFX11: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] + +v_cmpx_ngt_f16 vcc_lo, v2.l +// GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] -v_cmpx_ngt_f16 v127, v2 -// GFX11: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +v_cmpx_ngt_f16 vcc_hi, v2.l +// GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 s1, v2 -// GFX11: 
v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +v_cmpx_ngt_f16 ttmp15, v2.l +// GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 s105, v2 -// GFX11: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +v_cmpx_ngt_f16 m0, v2.l +// GFX11: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_lo, v2 -// GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_lo, v2.l +// GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_hi, v2 -// GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_hi, v2.l +// GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] -v_cmpx_ngt_f16 ttmp15, v2 -// GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 null, v2.l +// GFX11: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] -v_cmpx_ngt_f16 m0, v2 -// GFX11: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +v_cmpx_ngt_f16 -1, v2.l +// GFX11: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_lo, v2 -// GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0.5, v2.l +// GFX11: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_hi, v2 -// GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +v_cmpx_ngt_f16 src_scc, v2.l +// GFX11: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] -v_cmpx_ngt_f16 null, v2 -// GFX11: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0xfe0b, v127.l +// GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_ngt_f16 -1, v2 -// GFX11: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +v_cmpx_ngt_f16 v1.h, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] -v_cmpx_ngt_f16 0.5, v2 -// GFX11: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +v_cmpx_ngt_f16 v127.h, v2.l +// GFX11: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] -v_cmpx_ngt_f16 src_scc, v2 -// GFX11: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0.5, v127.l +// GFX11: v_cmpx_ngt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x16,0x7d] -v_cmpx_ngt_f16 0xfe0b, v127 -// GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16 src_scc, v2.h +// GFX11: v_cmpx_ngt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] + +v_cmpx_ngt_f16 0xfe0b, v127.h +// GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ngt_f32 v1, v2 // GFX11: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -3287,50 +3407,65 @@ v_cmpx_ngt_f64 src_scc, v[2:3] v_cmpx_ngt_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16 v1, v2 -// GFX11: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +v_cmpx_nle_f16 v1.l, v2.l +// GFX11: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] + +v_cmpx_nle_f16 v127.l, v2.l +// GFX11: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] + +v_cmpx_nle_f16 s1, v2.l +// GFX11: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] + +v_cmpx_nle_f16 s105, v2.l +// GFX11: v_cmpx_nle_f16_e32 s105, 
v2.l ; encoding: [0x69,0x04,0x18,0x7d] + +v_cmpx_nle_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] -v_cmpx_nle_f16 v127, v2 -// GFX11: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +v_cmpx_nle_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] -v_cmpx_nle_f16 s1, v2 -// GFX11: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +v_cmpx_nle_f16 ttmp15, v2.l +// GFX11: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] -v_cmpx_nle_f16 s105, v2 -// GFX11: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +v_cmpx_nle_f16 m0, v2.l +// GFX11: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_lo, v2 -// GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_lo, v2.l +// GFX11: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_hi, v2 -// GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_hi, v2.l +// GFX11: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] -v_cmpx_nle_f16 ttmp15, v2 -// GFX11: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +v_cmpx_nle_f16 null, v2.l +// GFX11: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] -v_cmpx_nle_f16 m0, v2 -// GFX11: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +v_cmpx_nle_f16 -1, v2.l +// GFX11: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_lo, v2 -// GFX11: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] +v_cmpx_nle_f16 0.5, v2.l +// GFX11: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_hi, v2 -// GFX11: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +v_cmpx_nle_f16 src_scc, v2.l +// GFX11: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] -v_cmpx_nle_f16 null, v2 -// GFX11: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +v_cmpx_nle_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nle_f16 -1, v2 -// GFX11: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +v_cmpx_nle_f16 v1.h, v2.l +// GFX11: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] -v_cmpx_nle_f16 0.5, v2 -// GFX11: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +v_cmpx_nle_f16 v127.h, v2.l +// GFX11: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] -v_cmpx_nle_f16 src_scc, v2 -// GFX11: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +v_cmpx_nle_f16 0.5, v127.l +// GFX11: v_cmpx_nle_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x18,0x7d] -v_cmpx_nle_f16 0xfe0b, v127 -// GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16 src_scc, v2.h +// GFX11: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] + +v_cmpx_nle_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nle_f32 v1, v2 // GFX11: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3413,50 +3548,65 @@ v_cmpx_nle_f64 src_scc, v[2:3] v_cmpx_nle_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16 v1, v2 -// GFX11: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x14,0x7d] +v_cmpx_nlg_f16 v1.l, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 v127.l, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 s1, v2.l +// GFX11: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 s105, v2.l +// GFX11: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] -v_cmpx_nlg_f16 v127, v2 -// GFX11: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +v_cmpx_nlg_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s1, v2 -// GFX11: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +v_cmpx_nlg_f16 ttmp15, v2.l +// GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s105, v2 -// GFX11: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +v_cmpx_nlg_f16 m0, v2.l +// GFX11: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_lo, v2 -// GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_lo, v2.l +// GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_hi, v2 -// GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_hi, v2.l +// GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] -v_cmpx_nlg_f16 ttmp15, v2 -// GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 null, v2.l +// GFX11: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] -v_cmpx_nlg_f16 m0, v2 -// GFX11: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +v_cmpx_nlg_f16 -1, v2.l +// GFX11: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_lo, v2 -// GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0.5, v2.l +// GFX11: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_hi, v2 -// GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +v_cmpx_nlg_f16 src_scc, v2.l +// GFX11: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] -v_cmpx_nlg_f16 null, v2 -// GFX11: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nlg_f16 -1, v2 -// GFX11: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v1.h, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] -v_cmpx_nlg_f16 0.5, v2 -// GFX11: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v127.h, v2.l +// GFX11: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] -v_cmpx_nlg_f16 src_scc, v2 -// GFX11: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0.5, v127.l +// GFX11: v_cmpx_nlg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x14,0x7d] -v_cmpx_nlg_f16 0xfe0b, v127 -// GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16 src_scc, v2.h +// GFX11: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] + +v_cmpx_nlg_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: 
[0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlg_f32 v1, v2 // GFX11: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3539,50 +3689,65 @@ v_cmpx_nlg_f64 src_scc, v[2:3] v_cmpx_nlg_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16 v1, v2 -// GFX11: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 v1.l, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 v127.l, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 s1, v2.l +// GFX11: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] + +v_cmpx_nlt_f16 s105, v2.l +// GFX11: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] + +v_cmpx_nlt_f16 vcc_lo, v2.l +// GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 v127, v2 -// GFX11: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 vcc_hi, v2.l +// GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 s1, v2 -// GFX11: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 ttmp15, v2.l +// GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 s105, v2 -// GFX11: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 m0, v2.l +// GFX11: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_lo, v2 -// GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_lo, v2.l +// GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_hi, v2 -// GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_hi, v2.l +// GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 ttmp15, v2 -// GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 null, v2.l +// GFX11: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 m0, v2 -// GFX11: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 -1, v2.l +// GFX11: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_lo, v2 -// GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 0.5, v2.l +// GFX11: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_hi, v2 -// GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 src_scc, v2.l +// GFX11: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 null, v2 -// GFX11: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 0xfe0b, v127.l +// GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nlt_f16 -1, v2 -// GFX11: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 v1.h, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 0.5, v2 -// GFX11: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 v127.h, v2.l +// GFX11: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 src_scc, v2 -// GFX11: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 0.5, v127.l +// GFX11: 
v_cmpx_nlt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1c,0x7d] -v_cmpx_nlt_f16 0xfe0b, v127 -// GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16 src_scc, v2.h +// GFX11: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] + +v_cmpx_nlt_f16 0xfe0b, v127.h +// GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlt_f32 v1, v2 // GFX11: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3665,50 +3830,65 @@ v_cmpx_nlt_f64 src_scc, v[2:3] v_cmpx_nlt_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16 v1, v2 -// GFX11: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +v_cmpx_o_f16 v1.l, v2.l +// GFX11: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] + +v_cmpx_o_f16 v127.l, v2.l +// GFX11: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] + +v_cmpx_o_f16 s1, v2.l +// GFX11: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] + +v_cmpx_o_f16 s105, v2.l +// GFX11: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] + +v_cmpx_o_f16 vcc_lo, v2.l +// GFX11: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] -v_cmpx_o_f16 v127, v2 -// GFX11: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +v_cmpx_o_f16 vcc_hi, v2.l +// GFX11: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] -v_cmpx_o_f16 s1, v2 -// GFX11: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +v_cmpx_o_f16 ttmp15, v2.l +// GFX11: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] -v_cmpx_o_f16 s105, v2 -// GFX11: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +v_cmpx_o_f16 m0, v2.l +// GFX11: v_cmpx_o_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_lo, v2 -// GFX11: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_lo, v2.l +// GFX11: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_hi, v2 -// GFX11: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_hi, v2.l +// GFX11: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] -v_cmpx_o_f16 ttmp15, v2 -// GFX11: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +v_cmpx_o_f16 null, v2.l +// GFX11: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] -v_cmpx_o_f16 m0, v2 -// GFX11: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +v_cmpx_o_f16 -1, v2.l +// GFX11: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_lo, v2 -// GFX11: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +v_cmpx_o_f16 0.5, v2.l +// GFX11: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_hi, v2 -// GFX11: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +v_cmpx_o_f16 src_scc, v2.l +// GFX11: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] -v_cmpx_o_f16 null, v2 -// GFX11: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] +v_cmpx_o_f16 0xfe0b, v127.l +// GFX11: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_o_f16 -1, v2 -// GFX11: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +v_cmpx_o_f16 v1.h, v2.l +// GFX11: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] -v_cmpx_o_f16 0.5, v2 -// GFX11: v_cmpx_o_f16_e32 0.5, v2 ; encoding: 
[0xf0,0x04,0x0e,0x7d] +v_cmpx_o_f16 v127.h, v2.l +// GFX11: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] -v_cmpx_o_f16 src_scc, v2 -// GFX11: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +v_cmpx_o_f16 0.5, v127.l +// GFX11: v_cmpx_o_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0e,0x7d] -v_cmpx_o_f16 0xfe0b, v127 -// GFX11: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16 src_scc, v2.h +// GFX11: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] + +v_cmpx_o_f16 0xfe0b, v127.h +// GFX11: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_o_f32 v1, v2 // GFX11: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -3791,50 +3971,80 @@ v_cmpx_o_f64 src_scc, v[2:3] v_cmpx_o_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_t_f16 v1, v2 -// GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] +v_cmpx_t_f16 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1e,0x7d] -v_cmpx_t_f16 v127, v2 -// GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] +v_cmpx_t_f16 v127.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1e,0x7d] -v_cmpx_t_f16 s1, v2 -// GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] +v_cmpx_t_f16 s1, v2.l +// GFX11: v_cmpx_t_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1e,0x7d] -v_cmpx_t_f16 s105, v2 -// GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] +v_cmpx_t_f16 s105, v2.l +// GFX11: v_cmpx_t_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1e,0x7d] -v_cmpx_t_f16 vcc_lo, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] +v_cmpx_t_f16 vcc_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1e,0x7d] -v_cmpx_t_f16 vcc_hi, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] +v_cmpx_t_f16 vcc_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1e,0x7d] -v_cmpx_t_f16 ttmp15, v2 -// GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] +v_cmpx_t_f16 ttmp15, v2.l +// GFX11: v_cmpx_t_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1e,0x7d] -v_cmpx_t_f16 m0, v2 -// GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] +v_cmpx_t_f16 m0, v2.l +// GFX11: v_cmpx_t_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1e,0x7d] -v_cmpx_t_f16 exec_lo, v2 -// GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] +v_cmpx_t_f16 exec_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1e,0x7d] -v_cmpx_t_f16 exec_hi, v2 -// GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] +v_cmpx_t_f16 exec_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1e,0x7d] -v_cmpx_t_f16 null, v2 -// GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] +v_cmpx_t_f16 null, v2.l +// GFX11: v_cmpx_t_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1e,0x7d] -v_cmpx_t_f16 -1, v2 -// GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] +v_cmpx_t_f16 -1, v2.l +// GFX11: v_cmpx_t_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1e,0x7d] -v_cmpx_t_f16 0.5, v2 -// GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] +v_cmpx_t_f16 0.5, v2.l +// GFX11: v_cmpx_t_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1e,0x7d] -v_cmpx_t_f16 src_scc, v2 -// GFX11: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: 
[0xfd,0x04,0x1e,0x7d] +v_cmpx_t_f16 src_scc, v2.l +// GFX11: v_cmpx_t_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1e,0x7d] -v_cmpx_t_f16 0xfe0b, v127 -// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_t_f16 0xfe0b, v127.l +// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_t_f16 v1.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1e,0x7d] + +v_cmpx_tru_f16 v1.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1e,0x7d] + +v_cmpx_t_f16 v127.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1e,0x7d] + +v_cmpx_tru_f16 v127.h, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1e,0x7d] + +v_cmpx_t_f16 0.5, v127.l +// GFX11: v_cmpx_t_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1e,0x7d] + +v_cmpx_tru_f16 0.5, v127.l +// GFX11: v_cmpx_t_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1e,0x7d] + +v_cmpx_t_f16 src_scc, v2.h +// GFX11: v_cmpx_t_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1f,0x7d] + +v_cmpx_tru_f16 src_scc, v2.h +// GFX11: v_cmpx_t_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1f,0x7d] + +v_cmpx_t_f16 0xfe0b, v127.h +// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_tru_f16 0xfe0b, v127.h +// GFX11: v_cmpx_t_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_t_f32 v1, v2 // GFX11: v_cmpx_t_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3e,0x7d] @@ -4079,41 +4289,41 @@ v_cmpx_t_u64 src_scc, v[2:3] v_cmpx_t_u64 0xaf123456, v[254:255] // GFX11: v_cmpx_t_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_tru_f16 v1, v2 -// GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] +v_cmpx_tru_f16 v1.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1e,0x7d] -v_cmpx_tru_f16 v127, v2 -// GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] +v_cmpx_tru_f16 v127.l, v2.l +// GFX11: v_cmpx_t_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1e,0x7d] -v_cmpx_tru_f16 s1, v2 -// GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] +v_cmpx_tru_f16 s1, v2.l +// GFX11: v_cmpx_t_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1e,0x7d] -v_cmpx_tru_f16 s105, v2 -// GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] +v_cmpx_tru_f16 s105, v2.l +// GFX11: v_cmpx_t_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1e,0x7d] -v_cmpx_tru_f16 vcc_lo, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] +v_cmpx_tru_f16 vcc_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1e,0x7d] -v_cmpx_tru_f16 vcc_hi, v2 -// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] +v_cmpx_tru_f16 vcc_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1e,0x7d] -v_cmpx_tru_f16 ttmp15, v2 -// GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] +v_cmpx_tru_f16 ttmp15, v2.l +// GFX11: v_cmpx_t_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1e,0x7d] -v_cmpx_tru_f16 m0, v2 -// GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] +v_cmpx_tru_f16 m0, v2.l +// GFX11: v_cmpx_t_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1e,0x7d] -v_cmpx_tru_f16 exec_lo, v2 -// GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] +v_cmpx_tru_f16 exec_lo, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1e,0x7d] -v_cmpx_tru_f16 exec_hi, 
v2 -// GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] +v_cmpx_tru_f16 exec_hi, v2.l +// GFX11: v_cmpx_t_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1e,0x7d] -v_cmpx_tru_f16 null, v2 -// GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] +v_cmpx_tru_f16 null, v2.l +// GFX11: v_cmpx_t_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1e,0x7d] -v_cmpx_tru_f16 -1, v2 -// GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] +v_cmpx_tru_f16 -1, v2.l +// GFX11: v_cmpx_t_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1e,0x7d] v_cmpx_tru_f16 0.5, v2 // GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] @@ -4205,50 +4415,65 @@ v_cmpx_tru_f64 src_scc, v[2:3] v_cmpx_tru_f64 0xaf123456, v[254:255] // GFX11: v_cmpx_t_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5f,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16 v1, v2 -// GFX11: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +v_cmpx_u_f16 v1.l, v2.l +// GFX11: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] + +v_cmpx_u_f16 v127.l, v2.l +// GFX11: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] + +v_cmpx_u_f16 s1, v2.l +// GFX11: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] + +v_cmpx_u_f16 s105, v2.l +// GFX11: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] + +v_cmpx_u_f16 vcc_lo, v2.l +// GFX11: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] + +v_cmpx_u_f16 vcc_hi, v2.l +// GFX11: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] -v_cmpx_u_f16 v127, v2 -// GFX11: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +v_cmpx_u_f16 ttmp15, v2.l +// GFX11: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] -v_cmpx_u_f16 s1, v2 -// GFX11: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +v_cmpx_u_f16 m0, v2.l +// GFX11: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] -v_cmpx_u_f16 s105, v2 -// GFX11: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_lo, v2.l +// GFX11: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_lo, v2 -// GFX11: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_hi, v2.l +// GFX11: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_hi, v2 -// GFX11: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +v_cmpx_u_f16 null, v2.l +// GFX11: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] -v_cmpx_u_f16 ttmp15, v2 -// GFX11: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +v_cmpx_u_f16 -1, v2.l +// GFX11: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] -v_cmpx_u_f16 m0, v2 -// GFX11: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +v_cmpx_u_f16 0.5, v2.l +// GFX11: v_cmpx_u_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_lo, v2 -// GFX11: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] +v_cmpx_u_f16 src_scc, v2.l +// GFX11: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_hi, v2 -// GFX11: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +v_cmpx_u_f16 0xfe0b, v127.l +// GFX11: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_u_f16 null, v2 -// GFX11: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +v_cmpx_u_f16 v1.h, v2.l +// GFX11: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] -v_cmpx_u_f16 -1, v2 -// 
GFX11: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d]

+v_cmpx_u_f16 v127.h, v2.l
+// GFX11: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d]

-v_cmpx_u_f16 0.5, v2
-// GFX11: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d]
+v_cmpx_u_f16 0.5, v127.l
+// GFX11: v_cmpx_u_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x10,0x7d]

-v_cmpx_u_f16 src_scc, v2
-// GFX11: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d]
+v_cmpx_u_f16 src_scc, v2.h
+// GFX11: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d]

-v_cmpx_u_f16 0xfe0b, v127
-// GFX11: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_u_f16 0xfe0b, v127.h
+// GFX11: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00]

 v_cmpx_u_f32 v1, v2
 // GFX11: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d]
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s
index e8d458874596e..1864a32c9f133 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp16.s
@@ -374,47 +374,56 @@ v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_f_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_f_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_f_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_f_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_f_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_f_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_f_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_f_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_f_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_f_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_f_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_f_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_f_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x01,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_f_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_f_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff]
@@ -542,47 +551,56 @@ v_cmpx_f_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_ge_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_ge_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_ge_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_ge_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_ge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff]
@@ -812,47 +830,56 @@ v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_gt_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_gt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_gt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff]
@@ -1082,47 +1109,56 @@ v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_le_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_le_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_le_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_le_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_le_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_le_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff]
@@ -1352,47 +1388,56 @@ v_cmpx_le_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_lg_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_lg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_lg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff]
@@ -1901,47 +1946,56 @@ v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_neq_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_neq_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_neq_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_neq_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_neq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_neq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff]
@@ -1985,47 +2039,56 @@ v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_nge_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_nge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff]
@@ -2069,47 +2132,56 @@ v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_ngt_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_ngt_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_ngt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_ngt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff]
@@ -2153,47 +2225,56 @@ v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_nle_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_nle_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nle_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff]
@@ -2237,47 +2318,56 @@ v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nlg_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nlg_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nlg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff]
@@ -2321,47 +2411,56 @@ v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_nlt_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_nlt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_nlt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xf5,0x30]

 v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0]
 // GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff]
@@ -2405,47 +2504,56 @@ v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
 v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_o_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_o_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_o_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_o_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_o_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_o_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf
; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:1 -// GFX11: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:15 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:15 -// GFX11: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:1 -// GFX11: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:15 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:15 -// GFX11: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_ror:1 -// GFX11: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16 v1, v2 row_ror:15 -// GFX11: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX11: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX11: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX11: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX11: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_o_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX11: v_cmpx_o_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01] -v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 
-// GFX11: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xf5,0x30]

v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0]
// GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff]
@@ -2489,47 +2597,65 @@ v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_t_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_t_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_t_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_t_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xf5,0x30]
+
+v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_tru_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01]
+
+v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_tru_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x09,0x13]
+
+v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xf5,0x30]
+
+v_cmpx_tru_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xf5,0x30]

v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0]
// GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff]
@@ -2657,38 +2783,38 @@ v_cmpx_t_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7d,0xff,0x6f,0x05,0x30]

-v_cmpx_tru_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_tru_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff]

-v_cmpx_tru_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_tru_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff]

-v_cmpx_tru_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_tru_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_tru_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff]

v_cmpx_tru_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
// GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01]
@@ -2741,47 +2867,56 @@ v_cmpx_tru_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
v_cmpx_tru_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
// GFX11: v_cmpx_t_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3f,0x7d,0xff,0x6f,0xf5,0x30]

-v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff]
+v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff]
+
+v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3]
+// GFX11: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff]
+
+v_cmpx_u_f16 v1.l, v2.l row_mirror
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff]
+
+v_cmpx_u_f16 v1.l, v2.l row_half_mirror
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff]

-v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3]
-// GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shl:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_mirror
-// GFX11: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shl:15
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_half_mirror
-// GFX11: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shr:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shl:1
-// GFX11: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_shr:15
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shl:15
-// GFX11: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_ror:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shr:1
-// GFX11: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_ror:15
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_shr:15
-// GFX11: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff]

-v_cmpx_u_f16 v1, v2 row_ror:1
-// GFX11: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01]

-v_cmpx_u_f16 v1, v2 row_ror:15
-// GFX11: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff]
+v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13]

-v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff]
+v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1
+// GFX11: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30]

-v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01]
+v_cmpx_u_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: v_cmpx_u_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01]

-v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13]
+v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x09,0x13]

-v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30]
+v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xf5,0x30]

v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0]
// GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff]

diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s
index 4f8895faf10a2..1d664e4ecb902 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_dpp8.s
@@ -110,14 +110,23 @@ v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x00,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_f_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_f_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x01,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05]
@@ -146,14 +155,23 @@ v_cmpx_f_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x91,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05]
@@ -218,14 +236,23 @@ v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x09,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05]
@@ -290,14 +317,23 @@ v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05]
@@ -362,14 +398,23 @@ v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05]
@@ -515,14 +560,23 @@ v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05]
@@ -533,14 +587,23 @@ v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05]
@@ -551,14 +614,23 @@ v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05]
@@ -569,14 +641,23 @@ v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05]
@@ -587,14 +668,23 @@ v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05]
@@ -605,14 +695,23 @@ v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05]
@@ -623,14 +722,23 @@ v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05]
@@ -641,14 +749,32 @@ v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

-v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_t_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_t_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_tru_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+
+v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_tru_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+
+v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+
+v_cmpx_tru_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05]
@@ -677,8 +803,8 @@ v_cmpx_t_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9f,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_tru_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_tru_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

v_cmpx_tru_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
// GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
@@ -695,14 +821,23 @@ v_cmpx_tru_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_cmpx_tru_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: v_cmpx_t_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3f,0x7d,0xff,0x00,0x00,0x00]

-v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+
+v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0]
+// GFX11: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+
+v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]

-v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]

-v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]

v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05]

diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s
index fe2220d7f5902..a1ab032e85d75 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_err.s
@@ -145,41 +145,77 @@ v_cmpx_eq_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_f_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_f_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_ge_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_f_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_ge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_ge_i16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -253,23 +289,41 @@ v_cmpx_ge_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_gt_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_gt_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_gt_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_gt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_gt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_gt_i16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -343,23 +397,41 @@ v_cmpx_gt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_le_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_le_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_le_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_le_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_le_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_le_i16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -433,23 +505,41 @@ v_cmpx_le_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_lg_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_lg_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction

-v_cmpx_lg_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:28: error: invalid operand for instruction
+v_cmpx_lg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction
+
+v_cmpx_lg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

v_cmpx_lt_f16_e32 v1.h, v255.h
// GFX11: :[[@LINE-1]]:25: error: invalid operand for instruction
@@ -631,148 +721,292 @@ v_cmpx_ne_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:19: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_neq_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_neq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_neq_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_neq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_neq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nge_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_neq_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nge_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nge_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for
instruction -v_cmpx_nge_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.l, v2.l +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.h, v255.h +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ngt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.l, v255.l +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 -// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ngt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_ngt_f16_e32 v255.h, v2.h +// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction 
+v_cmpx_ngt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_ngt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_ngt_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_ngt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_ngt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nle_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlg_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nle_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nle_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nle_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nle_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_nlt_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:29: error: invalid operand for instruction
+v_cmpx_nle_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nle_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_nlg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_o_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:22: error: invalid operand for instruction
+v_cmpx_nlg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:26: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_nlt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v255, v2
+v_cmpx_o_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_o_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_cmpx_t_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_t_f16_e32 v255, v2 quad_perm:[3,2,1,0]
+v_cmpx_t_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_t_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

v_cmpx_tru_f16_e32 v1, v255
@@ -793,20 +1027,38 @@ v_cmpx_tru_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_tru_f16_e32 v255, v2 quad_perm:[3,2,1,0]
// GFX11: :[[@LINE-1]]:20: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v1, v255
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_u_f16_e32 v1.h, v255.h
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v255, v2
-// GFX11: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode
+v_cmpx_u_f16_e32 v1.l, v255.l
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:24: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.h, v2.h
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

-v_cmpx_u_f16_e32 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: :[[@LINE-1]]:27: error: invalid operand for instruction
+v_cmpx_u_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.l, v2.l
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction
+
+v_cmpx_u_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: :[[@LINE-1]]:18: error: invalid operand for instruction

diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s
index c6814de818e6d..233858fd3021c 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vopcx_t16_promote.s
@@ -145,41 +145,77 @@ v_cmpx_eq_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_eq_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_f_f16 v1, v255
-// GFX11: v_cmpx_f_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_f_f16 v1.h, v255.h
+// GFX11: v_cmpx_f_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x80,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_f_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_f_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x80,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_f_f16 v255, v2
-// GFX11: v_cmpx_f_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_f_f16 v1.l, v255.l
+// GFX11: v_cmpx_f_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_f_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_f_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_f_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_f_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_f_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_ge_f16 v1, v255
-// GFX11: v_cmpx_ge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_f_f16 v255.h, v2.h
+// GFX11: v_cmpx_f_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x80,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_ge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_f_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x80,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_ge_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_f_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x80,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_ge_f16 v255, v2
-// GFX11: v_cmpx_ge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_f_f16 v255.l, v2.l
+// GFX11: v_cmpx_f_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_ge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_f_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_ge_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_f_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_f_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v1.h, v255.h
+// GFX11: v_cmpx_ge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v1.l, v255.l
+// GFX11: v_cmpx_ge_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v255.h, v2.h
+// GFX11: v_cmpx_ge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ge_f16 v255.l, v2.l
+// GFX11: v_cmpx_ge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ge_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_ge_i16 v1.h, v255.h
// GFX11: v_cmpx_ge_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00]
@@ -253,23 +289,41 @@ v_cmpx_ge_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_gt_f16 v1, v255
-// GFX11: v_cmpx_gt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_gt_f16 v1.h, v255.h
+// GFX11: v_cmpx_gt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_gt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_gt_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_gt_f16 v255, v2
-// GFX11: v_cmpx_gt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_gt_f16 v1.l, v255.l
+// GFX11: v_cmpx_gt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_gt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_gt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_gt_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_gt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_gt_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_gt_f16 v255.h, v2.h
+// GFX11: v_cmpx_gt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_gt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_gt_f16 v255.l, v2.l
+// GFX11: v_cmpx_gt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_gt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_gt_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_gt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_gt_i16 v1.h, v255.h
// GFX11: v_cmpx_gt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00]
@@ -343,23 +397,41 @@ v_cmpx_gt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_le_f16 v1, v255
-// GFX11: v_cmpx_le_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_le_f16 v1.h, v255.h
+// GFX11: v_cmpx_le_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_le_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_le_f16 v1.l, v255.l
+// GFX11: v_cmpx_le_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_le_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_le_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_le_f16 v255.h, v2.h
+// GFX11: v_cmpx_le_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_le_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_le_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_le_f16 v255, v2
-// GFX11: v_cmpx_le_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_le_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_le_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_le_f16 v255.l, v2.l
+// GFX11: v_cmpx_le_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_le_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_le_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_le_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_le_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_le_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_le_i16 v1.h, v255.h
// GFX11: v_cmpx_le_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00]
@@ -433,23 +505,41 @@ v_cmpx_le_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_le_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v1, v255
-// GFX11: v_cmpx_lg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_lg_f16 v1.h, v255.h
+// GFX11: v_cmpx_lg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_lg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_lg_f16 v1.l, v255.l
+// GFX11: v_cmpx_lg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_lg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_lg_f16 v255.h, v2.h
+// GFX11: v_cmpx_lg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_lg_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_lg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
-v_cmpx_lg_f16 v255, v2
-// GFX11: v_cmpx_lg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_lg_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_lg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_lg_f16 v255.l, v2.l
+// GFX11: v_cmpx_lg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_lg_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_lg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_lg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_lg_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_lg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

v_cmpx_lt_f16 v1.h, v255.h
// GFX11: v_cmpx_lt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00]
@@ -631,182 +721,362 @@ v_cmpx_ne_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
v_cmpx_ne_u16 v255.l, v2.l quad_perm:[3,2,1,0]
// GFX11: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_neq_f16 v1, v255
-// GFX11: v_cmpx_neq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_neq_f16 v1.h, v255.h
+// GFX11: v_cmpx_neq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_neq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v1.l, v255.l
+// GFX11: v_cmpx_neq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_neq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v255.h, v2.h
+// GFX11: v_cmpx_neq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_neq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_neq_f16 v255.l, v2.l
+// GFX11: v_cmpx_neq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_neq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_neq_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_neq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v1.h, v255.h
+// GFX11: v_cmpx_nge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v1.l, v255.l
+// GFX11: v_cmpx_nge_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v255.h, v2.h
+// GFX11: v_cmpx_nge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nge_f16 v255.l, v2.l
+// GFX11: v_cmpx_nge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nge_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.h, v255.h
+// GFX11: v_cmpx_ngt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ngt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v1.l, v255.l
+// GFX11: v_cmpx_ngt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_ngt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v255.h, v2.h
+// GFX11: v_cmpx_ngt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ngt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_ngt_f16 v255.l, v2.l
+// GFX11: v_cmpx_ngt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_ngt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_ngt_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v1.h, v255.h
+// GFX11: v_cmpx_nle_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nle_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v1.l, v255.l
+// GFX11: v_cmpx_nle_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nle_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v255.h, v2.h
+// GFX11: v_cmpx_nle_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nle_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nle_f16 v255.l, v2.l
+// GFX11: v_cmpx_nle_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nle_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nle_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nle_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.h, v255.h
+// GFX11: v_cmpx_nlg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nlg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v1.l, v255.l
+// GFX11: v_cmpx_nlg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00]
+
+v_cmpx_nlg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v255.h, v2.h
+// GFX11: v_cmpx_nlg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nlg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nlg_f16 v255.l, v2.l
+// GFX11: v_cmpx_nlg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00]
+
+v_cmpx_nlg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+
+v_cmpx_nlg_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+
+v_cmpx_nlt_f16 v1.h, v255.h
+// GFX11: v_cmpx_nlt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_neq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_neq_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_neq_f16 v255, v2
-// GFX11: v_cmpx_neq_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_nlt_f16 v1.l, v255.l
+// GFX11: v_cmpx_nlt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_neq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_neq_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_neq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nge_f16 v1, v255
-// GFX11: v_cmpx_nge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_nlt_f16 v255.h, v2.h
+// GFX11: v_cmpx_nlt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nge_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nge_f16 v255, v2
-// GFX11: v_cmpx_nge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_nlt_f16 v255.l, v2.l
+// GFX11: v_cmpx_nlt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_nlt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nge_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_nlt_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_ngt_f16 v1, v255
-// GFX11: v_cmpx_ngt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_o_f16 v1.h, v255.h
+// GFX11: v_cmpx_o_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_ngt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_ngt_f16 v255, v2
-// GFX11: v_cmpx_ngt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_o_f16 v1.l, v255.l
+// GFX11: v_cmpx_o_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_ngt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_o_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_ngt_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_ngt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_o_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nle_f16 v1, v255
-// GFX11: v_cmpx_nle_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_o_f16 v255.h, v2.h
+// GFX11: v_cmpx_o_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nle_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_o_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nle_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_o_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nle_f16 v255, v2
-// GFX11: v_cmpx_nle_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_o_f16 v255.l, v2.l
+// GFX11: v_cmpx_o_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nle_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_o_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nle_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nle_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_o_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_o_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nlg_f16 v1, v255
-// GFX11: v_cmpx_nlg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_t_f16 v1.h, v255.h
+// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_nlg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nlg_f16 v255, v2
-// GFX11: v_cmpx_nlg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_t_f16 v1.l, v255.l
+// GFX11: v_cmpx_t_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_nlg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_t_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_nlg_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_t_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_nlt_f16 v1, v255
-// GFX11: v_cmpx_nlt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_t_f16 v255.h, v2.h
+// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nlt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_t_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_t_f16 v255.h, v2.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_nlt_f16 v255, v2
-// GFX11: v_cmpx_nlt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_t_f16 v255.l, v2.l
+// GFX11: v_cmpx_t_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_nlt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_t_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]

-v_cmpx_nlt_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_nlt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_t_f16 v255.l, v2.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]

-v_cmpx_o_f16 v1, v255
-// GFX11: v_cmpx_o_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_tru_f16 v1.h, v255.h
+// GFX11: v_cmpx_t_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_o_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]
+v_cmpx_tru_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_o_f16 v1, v255 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]
+v_cmpx_tru_f16 v1.h, v255.h quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_o_f16 v255, v2
-// GFX11: v_cmpx_o_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00]
+v_cmpx_tru_f16 v1.l, v255.l
+// GFX11: v_cmpx_t_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00]

-v_cmpx_o_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05]
+v_cmpx_tru_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05]

-v_cmpx_o_f16 v255, v2 quad_perm:[3,2,1,0]
-// GFX11: v_cmpx_o_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff]
+v_cmpx_tru_f16 v1.l, v255.l quad_perm:[3,2,1,0]
+// GFX11: v_cmpx_t_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff]

-v_cmpx_t_f16 v1, v255
-// GFX11: v_cmpx_t_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00]
+v_cmpx_tru_f16 v255.h, v2.h
+// GFX11: v_cmpx_t_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8f,0xd4,0xff,0x05,0x02,0x00]

-v_cmpx_t_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0]
-//
GFX11: v_cmpx_t_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_tru_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_t_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_tru_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_t_f16 v255, v2 -// GFX11: v_cmpx_t_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_tru_f16 v255.l, v2.l +// GFX11: v_cmpx_t_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_t_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_tru_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_t_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_tru_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_t_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_tru_f16 v1, v255 -// GFX11: v_cmpx_t_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v1.h, v255.h +// GFX11: v_cmpx_u_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_tru_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_tru_f16 v255, v2 -// GFX11: v_cmpx_t_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v1.l, v255.l +// GFX11: v_cmpx_u_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_tru_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_t_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_tru_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: 
v_cmpx_t_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_u_f16 v1, v255 -// GFX11: v_cmpx_u_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v255.h, v2.h +// GFX11: v_cmpx_u_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_u_f16 v255, v2 -// GFX11: v_cmpx_u_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v255.l, v2.l +// GFX11: v_cmpx_u_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX11: v_cmpx_u_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX11: v_cmpx_u_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s index 17bd81fa7d259..cfc7b2c5fb665 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx.s @@ -542,11 +542,11 @@ v_cmpx_eq_u64_e64 src_scc, exec v_cmpx_eq_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16_e64 v1, v2 -// GFX12: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ge_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ge_f16_e64 v255, v255 -// GFX12: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ge_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ge_f16_e64 s1, s2 // GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] @@ -587,6 +587,12 @@ v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| 
v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_ge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ge_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_ge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ge_f32_e64 v1, v2 // GFX12: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -932,11 +938,11 @@ v_cmpx_ge_u64_e64 src_scc, exec v_cmpx_ge_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16_e64 v1, v2 -// GFX12: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_gt_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_gt_f16_e64 v255, v255 -// GFX12: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_gt_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] v_cmpx_gt_f16_e64 s1, s2 // GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] @@ -977,6 +983,12 @@ v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_gt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_gt_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_gt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_gt_f32_e64 v1, v2 // GFX12: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1322,11 +1334,11 @@ v_cmpx_gt_u64_e64 src_scc, exec v_cmpx_gt_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16_e64 v1, v2 -// GFX12: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_le_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_le_f16_e64 v255, v255 -// GFX12: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_le_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] v_cmpx_le_f16_e64 s1, s2 // GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] @@ -1367,6 +1379,12 @@ v_cmpx_le_f16_e64 -src_scc, |vcc_lo| v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_le_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_le_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_le_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_le_f32_e64 v1, v2 // GFX12: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -1712,11 +1730,11 @@ v_cmpx_le_u64_e64 src_scc, exec v_cmpx_le_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: 
[0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16_e64 v1, v2 -// GFX12: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_lg_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_lg_f16_e64 v255, v255 -// GFX12: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_lg_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] v_cmpx_lg_f16_e64 s1, s2 // GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] @@ -1757,6 +1775,12 @@ v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_lg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_lg_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_lg_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_lg_f32_e64 v1, v2 // GFX12: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -2498,11 +2522,11 @@ v_cmpx_ne_u64_e64 src_scc, exec v_cmpx_ne_u64_e64 0xaf123456, vcc // GFX12: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16_e64 v1, v2 -// GFX12: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_neq_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_neq_f16_e64 v255, v255 -// GFX12: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_neq_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] v_cmpx_neq_f16_e64 s1, s2 // GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] @@ -2543,6 +2567,12 @@ v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_neq_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_neq_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_neq_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_neq_f32_e64 v1, v2 // GFX12: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2624,11 +2654,11 @@ v_cmpx_neq_f64_e64 -|src_scc|, -|exec| v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16_e64 v1, v2 -// GFX12: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nge_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nge_f16_e64 v255, v255 -// GFX12: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nge_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nge_f16_e64 s1, s2 // GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: 
[0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] @@ -2669,6 +2699,12 @@ v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nge_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nge_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nge_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nge_f32_e64 v1, v2 // GFX12: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -2750,11 +2786,11 @@ v_cmpx_nge_f64_e64 -|src_scc|, -|exec| v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16_e64 v1, v2 -// GFX12: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_ngt_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_ngt_f16_e64 v255, v255 -// GFX12: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_ngt_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] v_cmpx_ngt_f16_e64 s1, s2 // GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] @@ -2795,6 +2831,12 @@ v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_ngt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_ngt_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_ngt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_ngt_f32_e64 v1, v2 // GFX12: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -2876,11 +2918,11 @@ v_cmpx_ngt_f64_e64 -|src_scc|, -|exec| v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16_e64 v1, v2 -// GFX12: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nle_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nle_f16_e64 v255, v255 -// GFX12: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nle_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nle_f16_e64 s1, s2 // GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] @@ -2921,6 +2963,12 @@ v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nle_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nle_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nle_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nle_f32_e64 v1, v2 // GFX12: v_cmpx_nle_f32_e64 v1, v2 ; 
encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3002,11 +3050,11 @@ v_cmpx_nle_f64_e64 -|src_scc|, -|exec| v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16_e64 v1, v2 -// GFX12: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlg_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlg_f16_e64 v255, v255 -// GFX12: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlg_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlg_f16_e64 s1, s2 // GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] @@ -3047,6 +3095,12 @@ v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nlg_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlg_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nlg_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlg_f32_e64 v1, v2 // GFX12: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3128,11 +3182,11 @@ v_cmpx_nlg_f64_e64 -|src_scc|, -|exec| v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16_e64 v1, v2 -// GFX12: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_nlt_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_nlt_f16_e64 v255, v255 -// GFX12: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_nlt_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] v_cmpx_nlt_f16_e64 s1, s2 // GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] @@ -3173,6 +3227,12 @@ v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_nlt_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_nlt_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_nlt_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_nlt_f32_e64 v1, v2 // GFX12: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3254,11 +3314,11 @@ v_cmpx_nlt_f64_e64 -|src_scc|, -|exec| v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16_e64 v1, v2 -// GFX12: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_o_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_o_f16_e64 v255, v255 -// GFX12: v_cmpx_o_f16_e64 v255, v255 ; encoding: 
[0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_o_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] v_cmpx_o_f16_e64 s1, s2 // GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] @@ -3299,6 +3359,12 @@ v_cmpx_o_f16_e64 -src_scc, |vcc_lo| v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_o_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_o_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_o_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_o_f32_e64 v1, v2 // GFX12: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -3380,11 +3446,11 @@ v_cmpx_o_f64_e64 -|src_scc|, -|exec| v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp // GFX12: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16_e64 v1, v2 -// GFX12: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +v_cmpx_u_f16_e64 v1.l, v2.l +// GFX12: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] -v_cmpx_u_f16_e64 v255, v255 -// GFX12: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +v_cmpx_u_f16_e64 v255.l, v255.l +// GFX12: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] v_cmpx_u_f16_e64 s1, s2 // GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] @@ -3425,6 +3491,12 @@ v_cmpx_u_f16_e64 -src_scc, |vcc_lo| v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp // GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +v_cmpx_u_f16_e64 v1.h, v2.l +// GFX12: v_cmpx_u_f16_e64 v1.h, v2.l ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] + +v_cmpx_u_f16_e64 v255.l, v255.h +// GFX12: v_cmpx_u_f16_e64 v255.l, v255.h ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] + v_cmpx_u_f32_e64 v1, v2 // GFX12: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s index 86f4b9a6789dd..aca2b4792f74a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp16.s @@ -422,53 +422,62 @@ v_cmpx_eq_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x86,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 
-// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -728,53 +737,62 @@ v_cmpx_ge_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_gt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 
-v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, 
|v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1034,53 +1052,62 @@ v_cmpx_gt_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_le_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, 
v2.l row_half_mirror +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1340,53 +1367,62 @@ v_cmpx_le_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
-v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 
fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1961,53 +1997,62 @@ v_cmpx_ne_u32_e64_dpp v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_neq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_neq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16_e64_dpp -v1, |v2| 
row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2057,53 +2102,62 @@ v_cmpx_neq_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 
quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2153,53 +2207,62 @@ v_cmpx_nge_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, 
v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 
bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2249,53 +2312,62 @@ v_cmpx_ngt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nle_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nle_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 
-v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2345,53 +2417,62 @@ v_cmpx_nle_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nlg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
-v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2441,53 +2522,62 @@ v_cmpx_nlg_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] + +v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2537,53 +2627,62 @@ v_cmpx_nlt_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ct v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_o_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_o_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_o_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_o_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_mirror -// GFX12: 
v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; 
encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] + +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2633,53 +2732,62 @@ v_cmpx_o_f32_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_cmpx_u_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, s2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x00,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, 2.0 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xe8,0x01,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_mirror -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l 
row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] -v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] -v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] +v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x09,0x13] -v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x05,0x30] v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s index 071a00ac73b8a..3503f3d62d737 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3cx_dpp8.s @@ -164,23 
+164,32 @@ v_cmpx_eq_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xca,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x86,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x86,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x86,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: 
v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -278,23 +287,32 @@ v_cmpx_ge_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xce,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_gt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x84,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x84,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0x7e,0x93,0x84,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -392,23 +410,32 @@ v_cmpx_gt_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcc,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_le_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x83,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x83,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| clamp 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x83,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -506,23 +533,32 @@ v_cmpx_le_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcb,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x85,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; 
encoding: [0x7e,0x0a,0x85,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x85,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -743,23 +779,32 @@ v_cmpx_ne_u32_e64_dpp v1, 10 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x00,0xcd,0xd4,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8d,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 
+v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -779,23 +824,32 @@ v_cmpx_neq_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9d,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x89,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x89,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x89,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -815,23 +869,32 @@ v_cmpx_nge_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x99,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8b,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: 
v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -851,23 +914,32 @@ v_cmpx_ngt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9b,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16_e64_dpp -v1, |2.0| 
dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8c,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -887,23 +959,32 @@ v_cmpx_nle_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9c,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 
; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8a,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -923,23 +1004,32 @@ v_cmpx_nlg_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9a,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x02,0x8e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x8e,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x8e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -959,23 +1049,32 @@ v_cmpx_nlt_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x9e,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: 
[0x7e,0x02,0x87,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_o_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x87,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x87,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x87,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -995,23 +1094,32 @@ v_cmpx_o_f32_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x97,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.l, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -v1, 
|v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_cmpx_u_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |s2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0x04,0x00,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16_e64_dpp -v1, |2.0| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x02,0x88,0xd4,0xea,0xe8,0x01,0x20,0x01,0x77,0x39,0x05] +v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x7e,0x0a,0x88,0xd4,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] -v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x83,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0x7e,0x93,0x88,0xd4,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s index ab01d37c39d3e..59634ba3cd64a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx.s @@ -554,50 +554,62 @@ v_cmpx_eq_u64 src_scc, v[2:3] v_cmpx_eq_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ge_f16 v1, v2 -// GFX12: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +v_cmpx_ge_f16 v1.l, v2.l +// GFX12: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] -v_cmpx_ge_f16 v127, v2 -// GFX12: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +v_cmpx_ge_f16 v127.l, v2.l +// GFX12: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] -v_cmpx_ge_f16 s1, v2 -// GFX12: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +v_cmpx_ge_f16 s1, v2.l +// GFX12: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] -v_cmpx_ge_f16 s105, v2 -// GFX12: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +v_cmpx_ge_f16 s105, v2.l +// GFX12: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_lo, v2 -// GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +v_cmpx_ge_f16 vcc_lo, v2.l +// GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] -v_cmpx_ge_f16 vcc_hi, v2 -// GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 vcc_hi, v2.l +// GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 ttmp15, v2 -// GFX12: 
v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +v_cmpx_ge_f16 ttmp15, v2.l +// GFX12: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] -v_cmpx_ge_f16 m0, v2 -// GFX12: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +v_cmpx_ge_f16 m0, v2.l +// GFX12: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_lo, v2 -// GFX12: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_lo, v2.l +// GFX12: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] -v_cmpx_ge_f16 exec_hi, v2 -// GFX12: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +v_cmpx_ge_f16 exec_hi, v2.l +// GFX12: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] -v_cmpx_ge_f16 null, v2 -// GFX12: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +v_cmpx_ge_f16 null, v2.l +// GFX12: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] -v_cmpx_ge_f16 -1, v2 -// GFX12: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +v_cmpx_ge_f16 -1, v2.l +// GFX12: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] -v_cmpx_ge_f16 0.5, v2 -// GFX12: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +v_cmpx_ge_f16 0.5, v2.l +// GFX12: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] -v_cmpx_ge_f16 src_scc, v2 -// GFX12: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +v_cmpx_ge_f16 src_scc, v2.l +// GFX12: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] -v_cmpx_ge_f16 0xfe0b, v127 -// GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ge_f16 0xfe0b, v127.l +// GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_ge_f16 v1.h, v2.l +// GFX12: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 v127.h, v2.l +// GFX12: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0c,0x7d] + +v_cmpx_ge_f16 src_scc, v2.h +// GFX12: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] + +v_cmpx_ge_f16 0xfe0b, v127.h +// GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ge_f32 v1, v2 // GFX12: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -956,50 +968,62 @@ v_cmpx_ge_u64 src_scc, v[2:3] v_cmpx_ge_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_gt_f16 v1, v2 -// GFX12: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +v_cmpx_gt_f16 v1.l, v2.l +// GFX12: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] + +v_cmpx_gt_f16 v127.l, v2.l +// GFX12: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] + +v_cmpx_gt_f16 s1, v2.l +// GFX12: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] -v_cmpx_gt_f16 v127, v2 -// GFX12: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] +v_cmpx_gt_f16 s105, v2.l +// GFX12: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] -v_cmpx_gt_f16 s1, v2 -// GFX12: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_lo, v2.l +// GFX12: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] -v_cmpx_gt_f16 s105, v2 -// GFX12: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +v_cmpx_gt_f16 vcc_hi, v2.l +// GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_lo, v2 -// 
GFX12: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +v_cmpx_gt_f16 ttmp15, v2.l +// GFX12: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] -v_cmpx_gt_f16 vcc_hi, v2 -// GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +v_cmpx_gt_f16 m0, v2.l +// GFX12: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] -v_cmpx_gt_f16 ttmp15, v2 -// GFX12: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_lo, v2.l +// GFX12: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] -v_cmpx_gt_f16 m0, v2 -// GFX12: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +v_cmpx_gt_f16 exec_hi, v2.l +// GFX12: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_lo, v2 -// GFX12: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +v_cmpx_gt_f16 null, v2.l +// GFX12: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] -v_cmpx_gt_f16 exec_hi, v2 -// GFX12: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +v_cmpx_gt_f16 -1, v2.l +// GFX12: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] -v_cmpx_gt_f16 null, v2 -// GFX12: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +v_cmpx_gt_f16 0.5, v2.l +// GFX12: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] -v_cmpx_gt_f16 -1, v2 -// GFX12: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +v_cmpx_gt_f16 src_scc, v2.l +// GFX12: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] -v_cmpx_gt_f16 0.5, v2 -// GFX12: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +v_cmpx_gt_f16 0xfe0b, v127.l +// GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_gt_f16 src_scc, v2 -// GFX12: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +v_cmpx_gt_f16 v1.h, v2.l +// GFX12: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] -v_cmpx_gt_f16 0xfe0b, v127 -// GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_gt_f16 v127.h, v2.l +// GFX12: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] + +v_cmpx_gt_f16 src_scc, v2.h +// GFX12: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] + +v_cmpx_gt_f16 0xfe0b, v127.h +// GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_gt_f32 v1, v2 // GFX12: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1358,50 +1382,62 @@ v_cmpx_gt_u64 src_scc, v[2:3] v_cmpx_gt_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_le_f16 v1, v2 -// GFX12: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +v_cmpx_le_f16 v1.l, v2.l +// GFX12: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] + +v_cmpx_le_f16 v127.l, v2.l +// GFX12: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] + +v_cmpx_le_f16 s1, v2.l +// GFX12: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] + +v_cmpx_le_f16 s105, v2.l +// GFX12: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] + +v_cmpx_le_f16 vcc_lo, v2.l +// GFX12: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] -v_cmpx_le_f16 v127, v2 -// GFX12: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +v_cmpx_le_f16 vcc_hi, v2.l +// GFX12: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] 
-v_cmpx_le_f16 s1, v2 -// GFX12: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +v_cmpx_le_f16 ttmp15, v2.l +// GFX12: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] -v_cmpx_le_f16 s105, v2 -// GFX12: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +v_cmpx_le_f16 m0, v2.l +// GFX12: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_lo, v2 -// GFX12: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_lo, v2.l +// GFX12: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] -v_cmpx_le_f16 vcc_hi, v2 -// GFX12: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +v_cmpx_le_f16 exec_hi, v2.l +// GFX12: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] -v_cmpx_le_f16 ttmp15, v2 -// GFX12: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +v_cmpx_le_f16 null, v2.l +// GFX12: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] -v_cmpx_le_f16 m0, v2 -// GFX12: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +v_cmpx_le_f16 -1, v2.l +// GFX12: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_lo, v2 -// GFX12: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +v_cmpx_le_f16 0.5, v2.l +// GFX12: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] -v_cmpx_le_f16 exec_hi, v2 -// GFX12: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.l +// GFX12: v_cmpx_le_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x06,0x7d] -v_cmpx_le_f16 null, v2 -// GFX12: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +v_cmpx_le_f16 0xfe0b, v127.l +// GFX12: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_le_f16 -1, v2 -// GFX12: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +v_cmpx_le_f16 v1.h, v2.l +// GFX12: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] -v_cmpx_le_f16 0.5, v2 -// GFX12: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +v_cmpx_le_f16 v127.h, v2.l +// GFX12: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] -v_cmpx_le_f16 src_scc, v2 -// GFX12: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +v_cmpx_le_f16 src_scc, v2.h +// GFX12: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] -v_cmpx_le_f16 0xfe0b, v127 -// GFX12: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_le_f16 0xfe0b, v127.h +// GFX12: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_le_f32 v1, v2 // GFX12: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -1760,50 +1796,62 @@ v_cmpx_le_u64 src_scc, v[2:3] v_cmpx_le_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_lg_f16 v1, v2 -// GFX12: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v1.l, v2.l +// GFX12: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] -v_cmpx_lg_f16 v127, v2 -// GFX12: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +v_cmpx_lg_f16 v127.l, v2.l +// GFX12: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] -v_cmpx_lg_f16 s1, v2 -// GFX12: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s1, v2.l +// GFX12: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] -v_cmpx_lg_f16 
s105, v2 -// GFX12: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +v_cmpx_lg_f16 s105, v2.l +// GFX12: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_lo, v2 -// GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_lo, v2.l +// GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] -v_cmpx_lg_f16 vcc_hi, v2 -// GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 vcc_hi, v2.l +// GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 ttmp15, v2 -// GFX12: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +v_cmpx_lg_f16 ttmp15, v2.l +// GFX12: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] -v_cmpx_lg_f16 m0, v2 -// GFX12: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +v_cmpx_lg_f16 m0, v2.l +// GFX12: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_lo, v2 -// GFX12: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_lo, v2.l +// GFX12: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] -v_cmpx_lg_f16 exec_hi, v2 -// GFX12: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +v_cmpx_lg_f16 exec_hi, v2.l +// GFX12: v_cmpx_lg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x7d] -v_cmpx_lg_f16 null, v2 -// GFX12: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +v_cmpx_lg_f16 null, v2.l +// GFX12: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] -v_cmpx_lg_f16 -1, v2 -// GFX12: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +v_cmpx_lg_f16 -1, v2.l +// GFX12: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0.5, v2 -// GFX12: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +v_cmpx_lg_f16 0.5, v2.l +// GFX12: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] -v_cmpx_lg_f16 src_scc, v2 -// GFX12: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +v_cmpx_lg_f16 src_scc, v2.l +// GFX12: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] -v_cmpx_lg_f16 0xfe0b, v127 -// GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_lg_f16 0xfe0b, v127.l +// GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_lg_f16 v1.h, v2.l +// GFX12: v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 v127.h, v2.l +// GFX12: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] + +v_cmpx_lg_f16 src_scc, v2.h +// GFX12: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] + +v_cmpx_lg_f16 0xfe0b, v127.h +// GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_lg_f32 v1, v2 // GFX12: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -2576,50 +2624,62 @@ v_cmpx_ne_u64 src_scc, v[2:3] v_cmpx_ne_u64 0xaf123456, v[254:255] // GFX12: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_neq_f16 v1, v2 -// GFX12: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +v_cmpx_neq_f16 v1.l, v2.l +// GFX12: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 v127.l, v2.l +// GFX12: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 s1, v2.l +// GFX12: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: 
[0x01,0x04,0x1a,0x7d] -v_cmpx_neq_f16 v127, v2 -// GFX12: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +v_cmpx_neq_f16 s105, v2.l +// GFX12: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s1, v2 -// GFX12: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +v_cmpx_neq_f16 vcc_lo, v2.l +// GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] -v_cmpx_neq_f16 s105, v2 -// GFX12: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +v_cmpx_neq_f16 vcc_hi, v2.l +// GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_lo, v2 -// GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +v_cmpx_neq_f16 ttmp15, v2.l +// GFX12: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] -v_cmpx_neq_f16 vcc_hi, v2 -// GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] +v_cmpx_neq_f16 m0, v2.l +// GFX12: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] -v_cmpx_neq_f16 ttmp15, v2 -// GFX12: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_lo, v2.l +// GFX12: v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] -v_cmpx_neq_f16 m0, v2 -// GFX12: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +v_cmpx_neq_f16 exec_hi, v2.l +// GFX12: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_lo, v2 -// GFX12: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +v_cmpx_neq_f16 null, v2.l +// GFX12: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] -v_cmpx_neq_f16 exec_hi, v2 -// GFX12: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +v_cmpx_neq_f16 -1, v2.l +// GFX12: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] -v_cmpx_neq_f16 null, v2 -// GFX12: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0.5, v2.l +// GFX12: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] -v_cmpx_neq_f16 -1, v2 -// GFX12: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +v_cmpx_neq_f16 src_scc, v2.l +// GFX12: v_cmpx_neq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1a,0x7d] -v_cmpx_neq_f16 0.5, v2 -// GFX12: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +v_cmpx_neq_f16 0xfe0b, v127.l +// GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_neq_f16 src_scc, v2 -// GFX12: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +v_cmpx_neq_f16 v1.h, v2.l +// GFX12: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] -v_cmpx_neq_f16 0xfe0b, v127 -// GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_neq_f16 v127.h, v2.l +// GFX12: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] + +v_cmpx_neq_f16 src_scc, v2.h +// GFX12: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] + +v_cmpx_neq_f16 0xfe0b, v127.h +// GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_neq_f32 v1, v2 // GFX12: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -2702,50 +2762,62 @@ v_cmpx_neq_f64 src_scc, v[2:3] v_cmpx_neq_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nge_f16 v1, v2 -// GFX12: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] 
+v_cmpx_nge_f16 v1.l, v2.l +// GFX12: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] + +v_cmpx_nge_f16 v127.l, v2.l +// GFX12: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] + +v_cmpx_nge_f16 s1, v2.l +// GFX12: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] + +v_cmpx_nge_f16 s105, v2.l +// GFX12: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] + +v_cmpx_nge_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] -v_cmpx_nge_f16 v127, v2 -// GFX12: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] +v_cmpx_nge_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] -v_cmpx_nge_f16 s1, v2 -// GFX12: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +v_cmpx_nge_f16 ttmp15, v2.l +// GFX12: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] -v_cmpx_nge_f16 s105, v2 -// GFX12: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +v_cmpx_nge_f16 m0, v2.l +// GFX12: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_lo, v2 -// GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_lo, v2.l +// GFX12: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] -v_cmpx_nge_f16 vcc_hi, v2 -// GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +v_cmpx_nge_f16 exec_hi, v2.l +// GFX12: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] -v_cmpx_nge_f16 ttmp15, v2 -// GFX12: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +v_cmpx_nge_f16 null, v2.l +// GFX12: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] -v_cmpx_nge_f16 m0, v2 -// GFX12: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +v_cmpx_nge_f16 -1, v2.l +// GFX12: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_lo, v2 -// GFX12: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +v_cmpx_nge_f16 0.5, v2.l +// GFX12: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] -v_cmpx_nge_f16 exec_hi, v2 -// GFX12: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +v_cmpx_nge_f16 src_scc, v2.l +// GFX12: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] -v_cmpx_nge_f16 null, v2 -// GFX12: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +v_cmpx_nge_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nge_f16 -1, v2 -// GFX12: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +v_cmpx_nge_f16 v1.h, v2.l +// GFX12: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] -v_cmpx_nge_f16 0.5, v2 -// GFX12: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +v_cmpx_nge_f16 v127.h, v2.l +// GFX12: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] -v_cmpx_nge_f16 src_scc, v2 -// GFX12: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +v_cmpx_nge_f16 src_scc, v2.h +// GFX12: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] -v_cmpx_nge_f16 0xfe0b, v127 -// GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nge_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nge_f32 v1, v2 // GFX12: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -2828,50 
+2900,62 @@ v_cmpx_nge_f64 src_scc, v[2:3] v_cmpx_nge_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_ngt_f16 v1, v2 -// GFX12: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +v_cmpx_ngt_f16 v1.l, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] -v_cmpx_ngt_f16 v127, v2 -// GFX12: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +v_cmpx_ngt_f16 v127.l, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] -v_cmpx_ngt_f16 s1, v2 -// GFX12: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +v_cmpx_ngt_f16 s1, v2.l +// GFX12: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] -v_cmpx_ngt_f16 s105, v2 -// GFX12: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +v_cmpx_ngt_f16 s105, v2.l +// GFX12: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_lo, v2 -// GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +v_cmpx_ngt_f16 vcc_lo, v2.l +// GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] -v_cmpx_ngt_f16 vcc_hi, v2 -// GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 vcc_hi, v2.l +// GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 ttmp15, v2 -// GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +v_cmpx_ngt_f16 ttmp15, v2.l +// GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] -v_cmpx_ngt_f16 m0, v2 -// GFX12: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +v_cmpx_ngt_f16 m0, v2.l +// GFX12: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_lo, v2 -// GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_lo, v2.l +// GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] -v_cmpx_ngt_f16 exec_hi, v2 -// GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +v_cmpx_ngt_f16 exec_hi, v2.l +// GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] -v_cmpx_ngt_f16 null, v2 -// GFX12: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +v_cmpx_ngt_f16 null, v2.l +// GFX12: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] -v_cmpx_ngt_f16 -1, v2 -// GFX12: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +v_cmpx_ngt_f16 -1, v2.l +// GFX12: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] -v_cmpx_ngt_f16 0.5, v2 -// GFX12: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +v_cmpx_ngt_f16 0.5, v2.l +// GFX12: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] -v_cmpx_ngt_f16 src_scc, v2 -// GFX12: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +v_cmpx_ngt_f16 src_scc, v2.l +// GFX12: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] -v_cmpx_ngt_f16 0xfe0b, v127 -// GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_ngt_f16 0xfe0b, v127.l +// GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_ngt_f16 v1.h, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 v127.h, v2.l +// GFX12: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] + +v_cmpx_ngt_f16 src_scc, v2.h +// GFX12: v_cmpx_ngt_f16_e32 
src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] + +v_cmpx_ngt_f16 0xfe0b, v127.h +// GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_ngt_f32 v1, v2 // GFX12: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -2954,50 +3038,62 @@ v_cmpx_ngt_f64 src_scc, v[2:3] v_cmpx_ngt_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nle_f16 v1, v2 -// GFX12: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +v_cmpx_nle_f16 v1.l, v2.l +// GFX12: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] + +v_cmpx_nle_f16 v127.l, v2.l +// GFX12: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] + +v_cmpx_nle_f16 s1, v2.l +// GFX12: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] -v_cmpx_nle_f16 v127, v2 -// GFX12: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +v_cmpx_nle_f16 s105, v2.l +// GFX12: v_cmpx_nle_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x18,0x7d] -v_cmpx_nle_f16 s1, v2 -// GFX12: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +v_cmpx_nle_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] -v_cmpx_nle_f16 s105, v2 -// GFX12: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +v_cmpx_nle_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_lo, v2 -// GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +v_cmpx_nle_f16 ttmp15, v2.l +// GFX12: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] -v_cmpx_nle_f16 vcc_hi, v2 -// GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +v_cmpx_nle_f16 m0, v2.l +// GFX12: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] -v_cmpx_nle_f16 ttmp15, v2 -// GFX12: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_lo, v2.l +// GFX12: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] -v_cmpx_nle_f16 m0, v2 -// GFX12: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +v_cmpx_nle_f16 exec_hi, v2.l +// GFX12: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_lo, v2 -// GFX12: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] +v_cmpx_nle_f16 null, v2.l +// GFX12: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] -v_cmpx_nle_f16 exec_hi, v2 -// GFX12: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +v_cmpx_nle_f16 -1, v2.l +// GFX12: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] -v_cmpx_nle_f16 null, v2 -// GFX12: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +v_cmpx_nle_f16 0.5, v2.l +// GFX12: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] -v_cmpx_nle_f16 -1, v2 -// GFX12: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +v_cmpx_nle_f16 src_scc, v2.l +// GFX12: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] -v_cmpx_nle_f16 0.5, v2 -// GFX12: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +v_cmpx_nle_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nle_f16 src_scc, v2 -// GFX12: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +v_cmpx_nle_f16 v1.h, v2.l +// GFX12: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] -v_cmpx_nle_f16 
0xfe0b, v127 -// GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nle_f16 v127.h, v2.l +// GFX12: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] + +v_cmpx_nle_f16 src_scc, v2.h +// GFX12: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] + +v_cmpx_nle_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nle_f32 v1, v2 // GFX12: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3080,50 +3176,62 @@ v_cmpx_nle_f64 src_scc, v[2:3] v_cmpx_nle_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlg_f16 v1, v2 -// GFX12: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] +v_cmpx_nlg_f16 v1.l, v2.l +// GFX12: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 v127.l, v2.l +// GFX12: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] + +v_cmpx_nlg_f16 s1, v2.l +// GFX12: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 s105, v2.l +// GFX12: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] + +v_cmpx_nlg_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] -v_cmpx_nlg_f16 v127, v2 -// GFX12: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +v_cmpx_nlg_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s1, v2 -// GFX12: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +v_cmpx_nlg_f16 ttmp15, v2.l +// GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] -v_cmpx_nlg_f16 s105, v2 -// GFX12: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +v_cmpx_nlg_f16 m0, v2.l +// GFX12: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_lo, v2 -// GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_lo, v2.l +// GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] -v_cmpx_nlg_f16 vcc_hi, v2 -// GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 exec_hi, v2.l +// GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] -v_cmpx_nlg_f16 ttmp15, v2 -// GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +v_cmpx_nlg_f16 null, v2.l +// GFX12: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] -v_cmpx_nlg_f16 m0, v2 -// GFX12: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +v_cmpx_nlg_f16 -1, v2.l +// GFX12: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_lo, v2 -// GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0.5, v2.l +// GFX12: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] -v_cmpx_nlg_f16 exec_hi, v2 -// GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +v_cmpx_nlg_f16 src_scc, v2.l +// GFX12: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] -v_cmpx_nlg_f16 null, v2 -// GFX12: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +v_cmpx_nlg_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_nlg_f16 -1, v2 -// GFX12: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v1.h, v2.l +// 
GFX12: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] -v_cmpx_nlg_f16 0.5, v2 -// GFX12: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +v_cmpx_nlg_f16 v127.h, v2.l +// GFX12: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] -v_cmpx_nlg_f16 src_scc, v2 -// GFX12: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +v_cmpx_nlg_f16 src_scc, v2.h +// GFX12: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] -v_cmpx_nlg_f16 0xfe0b, v127 -// GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlg_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlg_f32 v1, v2 // GFX12: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3206,50 +3314,62 @@ v_cmpx_nlg_f64 src_scc, v[2:3] v_cmpx_nlg_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_nlt_f16 v1, v2 -// GFX12: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 v1.l, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 v127, v2 -// GFX12: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +v_cmpx_nlt_f16 v127.l, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] -v_cmpx_nlt_f16 s1, v2 -// GFX12: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 s1, v2.l +// GFX12: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 s105, v2 -// GFX12: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 s105, v2.l +// GFX12: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_lo, v2 -// GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 vcc_lo, v2.l +// GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 vcc_hi, v2 -// GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 vcc_hi, v2.l +// GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 ttmp15, v2 -// GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 ttmp15, v2.l +// GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 m0, v2 -// GFX12: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 m0, v2.l +// GFX12: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_lo, v2 -// GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_lo, v2.l +// GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 exec_hi, v2 -// GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 exec_hi, v2.l +// GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 null, v2 -// GFX12: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 null, v2.l +// GFX12: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 -1, v2 -// GFX12: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 -1, v2.l +// GFX12: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 0.5, v2 -// GFX12: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] 
+v_cmpx_nlt_f16 0.5, v2.l +// GFX12: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 src_scc, v2 -// GFX12: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +v_cmpx_nlt_f16 src_scc, v2.l +// GFX12: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] -v_cmpx_nlt_f16 0xfe0b, v127 -// GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_nlt_f16 0xfe0b, v127.l +// GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] + +v_cmpx_nlt_f16 v1.h, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 v127.h, v2.l +// GFX12: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] + +v_cmpx_nlt_f16 src_scc, v2.h +// GFX12: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] + +v_cmpx_nlt_f16 0xfe0b, v127.h +// GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_nlt_f32 v1, v2 // GFX12: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3332,50 +3452,62 @@ v_cmpx_nlt_f64 src_scc, v[2:3] v_cmpx_nlt_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_o_f16 v1, v2 -// GFX12: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +v_cmpx_o_f16 v1.l, v2.l +// GFX12: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] + +v_cmpx_o_f16 v127.l, v2.l +// GFX12: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] + +v_cmpx_o_f16 s1, v2.l +// GFX12: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] -v_cmpx_o_f16 v127, v2 -// GFX12: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +v_cmpx_o_f16 s105, v2.l +// GFX12: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] -v_cmpx_o_f16 s1, v2 -// GFX12: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +v_cmpx_o_f16 vcc_lo, v2.l +// GFX12: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] -v_cmpx_o_f16 s105, v2 -// GFX12: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +v_cmpx_o_f16 vcc_hi, v2.l +// GFX12: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_lo, v2 -// GFX12: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +v_cmpx_o_f16 ttmp15, v2.l +// GFX12: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] -v_cmpx_o_f16 vcc_hi, v2 -// GFX12: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +v_cmpx_o_f16 m0, v2.l +// GFX12: v_cmpx_o_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0e,0x7d] -v_cmpx_o_f16 ttmp15, v2 -// GFX12: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_lo, v2.l +// GFX12: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] -v_cmpx_o_f16 m0, v2 -// GFX12: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +v_cmpx_o_f16 exec_hi, v2.l +// GFX12: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_lo, v2 -// GFX12: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +v_cmpx_o_f16 null, v2.l +// GFX12: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] -v_cmpx_o_f16 exec_hi, v2 -// GFX12: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +v_cmpx_o_f16 -1, v2.l +// GFX12: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] -v_cmpx_o_f16 null, v2 -// GFX12: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] 
+v_cmpx_o_f16 0.5, v2.l +// GFX12: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] -v_cmpx_o_f16 -1, v2 -// GFX12: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +v_cmpx_o_f16 src_scc, v2.l +// GFX12: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] -v_cmpx_o_f16 0.5, v2 -// GFX12: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] +v_cmpx_o_f16 0xfe0b, v127.l +// GFX12: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_o_f16 src_scc, v2 -// GFX12: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +v_cmpx_o_f16 v1.h, v2.l +// GFX12: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] -v_cmpx_o_f16 0xfe0b, v127 -// GFX12: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_o_f16 v127.h, v2.l +// GFX12: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] + +v_cmpx_o_f16 src_scc, v2.h +// GFX12: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] + +v_cmpx_o_f16 0xfe0b, v127.h +// GFX12: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_o_f32 v1, v2 // GFX12: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -3458,50 +3590,62 @@ v_cmpx_o_f64 src_scc, v[2:3] v_cmpx_o_f64 0xaf123456, v[254:255] // GFX12: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] -v_cmpx_u_f16 v1, v2 -// GFX12: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +v_cmpx_u_f16 v1.l, v2.l +// GFX12: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] + +v_cmpx_u_f16 v127.l, v2.l +// GFX12: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] + +v_cmpx_u_f16 s1, v2.l +// GFX12: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] + +v_cmpx_u_f16 s105, v2.l +// GFX12: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] + +v_cmpx_u_f16 vcc_lo, v2.l +// GFX12: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] -v_cmpx_u_f16 v127, v2 -// GFX12: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +v_cmpx_u_f16 vcc_hi, v2.l +// GFX12: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] -v_cmpx_u_f16 s1, v2 -// GFX12: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +v_cmpx_u_f16 ttmp15, v2.l +// GFX12: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] -v_cmpx_u_f16 s105, v2 -// GFX12: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +v_cmpx_u_f16 m0, v2.l +// GFX12: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_lo, v2 -// GFX12: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_lo, v2.l +// GFX12: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] -v_cmpx_u_f16 vcc_hi, v2 -// GFX12: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +v_cmpx_u_f16 exec_hi, v2.l +// GFX12: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] -v_cmpx_u_f16 ttmp15, v2 -// GFX12: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +v_cmpx_u_f16 null, v2.l +// GFX12: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] -v_cmpx_u_f16 m0, v2 -// GFX12: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +v_cmpx_u_f16 -1, v2.l +// GFX12: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_lo, v2 -// GFX12: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] +v_cmpx_u_f16 0.5, v2.l +// GFX12: v_cmpx_u_f16_e32 0.5, 
v2.l ; encoding: [0xf0,0x04,0x10,0x7d] -v_cmpx_u_f16 exec_hi, v2 -// GFX12: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +v_cmpx_u_f16 src_scc, v2.l +// GFX12: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] -v_cmpx_u_f16 null, v2 -// GFX12: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +v_cmpx_u_f16 0xfe0b, v127.l +// GFX12: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] -v_cmpx_u_f16 -1, v2 -// GFX12: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] +v_cmpx_u_f16 v1.h, v2.l +// GFX12: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] -v_cmpx_u_f16 0.5, v2 -// GFX12: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] +v_cmpx_u_f16 v127.h, v2.l +// GFX12: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d] -v_cmpx_u_f16 src_scc, v2 -// GFX12: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] +v_cmpx_u_f16 src_scc, v2.h +// GFX12: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d] -v_cmpx_u_f16 0xfe0b, v127 -// GFX12: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +v_cmpx_u_f16 0xfe0b, v127.h +// GFX12: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] v_cmpx_u_f32 v1, v2 // GFX12: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s index 2b919fa9d671e..3a3a89b8a9932 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp16.s @@ -362,47 +362,53 @@ v_cmpx_eq_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_ge_f16 v1, v2 row_mirror -// GFX12: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0xfa,0x04,0x0d,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] @@ -626,47 +632,53 @@ v_cmpx_ge_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_mirror -// GFX12: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff] @@ -890,47 +902,53 @@ v_cmpx_gt_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// 
GFX12: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_le_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_le_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_mirror -// GFX12: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_le_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// 
GFX12: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff] @@ -1154,47 +1172,53 @@ v_cmpx_le_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_lg_f16 v1, v2 row_mirror -// GFX12: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_lg_f16 
v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: 
[0xfa,0x04,0x0b,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] @@ -1688,47 +1712,53 @@ v_cmpx_ne_u32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x05,0x30] -v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_mirror -// GFX12: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] @@ -1772,47 +1802,53 @@ v_cmpx_neq_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_nge_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nge_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff] @@ -1856,47 +1892,53 @@ v_cmpx_nge_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_ngt_f16 v1, v2 row_mirror -// GFX12: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shl:1 -// 
GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: 
[0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff] @@ -1940,47 +1982,53 @@ v_cmpx_ngt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff] @@ -2024,47 +2072,53 @@ v_cmpx_nle_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] -// 
GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_nlg_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nlg_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 
bank_mask:0x1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff] @@ -2108,47 +2162,53 @@ v_cmpx_nlg_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_nlt_f16 v1, v2 row_mirror -// GFX12: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 
row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xf5,0x30] + +v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff] @@ -2192,47 +2252,53 @@ v_cmpx_nlt_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] -v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_o_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_mirror -// GFX12: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x09,0x13] + +v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff] @@ -2276,47 +2342,53 @@ v_cmpx_o_f32 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; 
encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xf5,0x30] -v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] + +v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] + +v_cmpx_u_f16 v1.l, v2.l row_mirror +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] -v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] -// GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +v_cmpx_u_f16 v1.l, v2.l row_half_mirror +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_mirror -// GFX12: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shl:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_half_mirror -// GFX12: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shl:15 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shl:1 -// GFX12: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shr:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shl:15 -// GFX12: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_shr:15 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shr:1 -// GFX12: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_ror:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_shr:15 -// GFX12: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_ror:15 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_ror:1 -// GFX12: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] -v_cmpx_u_f16 v1, v2 row_ror:15 -// GFX12: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_share:15 
row_mask:0x0 bank_mask:0x1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] -v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13] -v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 +// GFX12: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30] -v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX12: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x09,0x13] +v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x09,0x13] -v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xf5,0x30] +v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xf5,0x30] v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] // GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s index 11579786d78a8..0f30751003373 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_dpp8.s @@ -98,14 +98,20 @@ v_cmpx_eq_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ge_f16 v1.h, v2.h 
dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] @@ -164,14 +170,20 @@ v_cmpx_ge_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00] v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05] @@ -230,14 +242,20 @@ v_cmpx_gt_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x07,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; 
encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00] v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05] @@ -296,14 +314,20 @@ v_cmpx_le_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05] @@ -434,14 +458,20 @@ v_cmpx_ne_u32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_neq_f16 v127.h, v127.h 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00] v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] @@ -452,14 +482,20 @@ v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x13,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] @@ -470,14 +506,20 @@ v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x17,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_ngt_f16 
v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00] v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] @@ -488,14 +530,20 @@ v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x19,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] @@ -506,14 +554,20 @@ v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x15,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: 
[0xe9,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] @@ -524,14 +578,20 @@ v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] + +v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00] v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] @@ -542,14 +602,20 @@ v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] -v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05] + +v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00] v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // 
GFX12: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] @@ -560,14 +626,20 @@ v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] -v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] + +v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] +// GFX12: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] -v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0xea,0x04,0x11,0x7d,0x81,0x77,0x39,0x05] -v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] +v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xe9,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00] v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s index 265ab2c8ff66d..58c355ed56ab1 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_err.s @@ -145,23 +145,41 @@ v_cmpx_eq_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_ge_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_ge_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_ge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// 
GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_ge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_ge_i16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -235,23 +253,41 @@ v_cmpx_ge_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_gt_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_gt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_gt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_gt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_gt_i16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -325,23 +361,41 @@ v_cmpx_gt_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_le_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: 
error: invalid operand for instruction -v_cmpx_le_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_le_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_le_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_le_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_le_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_le_i16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -415,23 +469,41 @@ v_cmpx_le_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lg_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_lg_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction -v_cmpx_lg_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:28: error: invalid operand for instruction +v_cmpx_lg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 
v255.h, v2.h +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction + +v_cmpx_lg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction v_cmpx_lt_f16_e32 v1.h, v255.h // GFX12: :[[@LINE-1]]:25: error: invalid operand for instruction @@ -613,146 +685,290 @@ v_cmpx_ne_u16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ne_u16_e32 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: :[[@LINE-1]]:19: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_neq_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_neq_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.l, v2.l +// 
GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nge_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_ngt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nle_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction + +v_cmpx_nlg_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for 
instruction +v_cmpx_nlg_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlg_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_neq_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlg_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlg_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nge_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlg_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for instruction -v_cmpx_ngt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:26: error: invalid operand for 
instruction -v_cmpx_nle_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_nlt_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nle_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_nlt_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:20: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlg_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_o_f16_e32 v255.l, 
v2.l +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_nlt_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:29: error: invalid operand for instruction +v_cmpx_o_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_o_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v1.h, v255.h +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v1.l, v255.l +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_o_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:24: error: invalid operand for instruction -v_cmpx_u_f16_e32 v1, v255 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v255.h, v2.h +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v1, v255 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v255, v2 -// GFX12: :[[@LINE-1]]:1: error: operands are not valid for this GPU or mode +v_cmpx_u_f16_e32 v255.l, v2.l +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction -v_cmpx_u_f16_e32 v255, v2 quad_perm:[3,2,1,0] -// GFX12: :[[@LINE-1]]:27: error: invalid operand for instruction +v_cmpx_u_f16_e32 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: :[[@LINE-1]]:18: error: invalid operand for instruction diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s index ed228c061d019..3d02c95c94ac0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vopcx_t16_promote.s @@ -145,23 +145,41 @@ v_cmpx_eq_u16 v255.l, 
v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_eq_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_eq_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xba,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ge_f16 v1, v255 -// GFX12: v_cmpx_ge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_ge_f16 v1.h, v255.h +// GFX12: v_cmpx_ge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x86,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_ge_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ge_f16 v255, v2 -// GFX12: v_cmpx_ge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_ge_f16 v1.l, v255.l +// GFX12: v_cmpx_ge_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_ge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ge_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_ge_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ge_f16 v255.h, v2.h +// GFX12: v_cmpx_ge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x86,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ge_f16 v255.l, v2.l +// GFX12: v_cmpx_ge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ge_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_ge_i16 v1.h, v255.h // GFX12: v_cmpx_ge_i16_e64 v1.h, v255.h ; encoding: 
[0x7e,0x18,0xb6,0xd4,0x01,0xff,0x03,0x00] @@ -235,23 +253,41 @@ v_cmpx_ge_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ge_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ge_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbe,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_gt_f16 v1, v255 -// GFX12: v_cmpx_gt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_gt_f16 v1.h, v255.h +// GFX12: v_cmpx_gt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x84,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_gt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_gt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_gt_f16 v1.l, v255.l +// GFX12: v_cmpx_gt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_gt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_gt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_gt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_gt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_gt_f16 v255, v2 -// GFX12: v_cmpx_gt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_gt_f16 v255.h, v2.h +// GFX12: v_cmpx_gt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x84,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_gt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_gt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_gt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_gt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_gt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_gt_f16 v255.l, v2.l +// GFX12: v_cmpx_gt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_gt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_gt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_gt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_gt_i16 
v1.h, v255.h // GFX12: v_cmpx_gt_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb4,0xd4,0x01,0xff,0x03,0x00] @@ -325,23 +361,41 @@ v_cmpx_gt_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_gt_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_gt_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbc,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_f16 v1, v255 -// GFX12: v_cmpx_le_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_le_f16 v1.h, v255.h +// GFX12: v_cmpx_le_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x83,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16 v1.l, v255.l +// GFX12: v_cmpx_le_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_le_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_le_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_le_f16 v255.h, v2.h +// GFX12: v_cmpx_le_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x83,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_le_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_le_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_le_f16 v255, v2 -// GFX12: v_cmpx_le_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_le_f16 v255.l, v2.l +// GFX12: v_cmpx_le_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_le_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_le_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_le_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_le_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_le_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_le_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_le_i16 v1.h, v255.h // GFX12: v_cmpx_le_i16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0xb3,0xd4,0x01,0xff,0x03,0x00] @@ -415,23 +469,41 @@ v_cmpx_le_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_le_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_le_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbb,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_lg_f16 v1, v255 -// GFX12: v_cmpx_lg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_lg_f16 v1.h, v255.h +// GFX12: v_cmpx_lg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x85,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_lg_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_lg_f16 v255, v2 -// GFX12: v_cmpx_lg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_lg_f16 v1.l, v255.l +// GFX12: v_cmpx_lg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_lg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_lg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_lg_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_lg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_lg_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_lg_f16 v255.h, v2.h +// GFX12: v_cmpx_lg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x85,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lg_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_lg_f16 v255.l, v2.l +// GFX12: v_cmpx_lg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_lg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_lg_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_lg_f16_e64_dpp 
v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] v_cmpx_lt_f16 v1.h, v255.h // GFX12: v_cmpx_lt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x81,0xd4,0x01,0xff,0x03,0x00] @@ -613,146 +685,290 @@ v_cmpx_ne_u16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] v_cmpx_ne_u16 v255.l, v2.l quad_perm:[3,2,1,0] // GFX12: v_cmpx_ne_u16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0xbd,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_neq_f16 v1, v255 -// GFX12: v_cmpx_neq_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_neq_f16 v1.h, v255.h +// GFX12: v_cmpx_neq_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8d,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_neq_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v1.l, v255.l +// GFX12: v_cmpx_neq_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_neq_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_neq_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v255.h, v2.h +// GFX12: v_cmpx_neq_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8d,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_neq_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_neq_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_neq_f16 v255.l, v2.l +// GFX12: v_cmpx_neq_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_neq_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_neq_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_neq_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v1.h, v255.h +// GFX12: v_cmpx_nge_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x89,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nge_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v1.l, v255.l +// GFX12: v_cmpx_nge_f16_e64 v1.l, v255.l ; encoding: 
[0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nge_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nge_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v255.h, v2.h +// GFX12: v_cmpx_nge_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x89,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nge_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nge_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nge_f16 v255.l, v2.l +// GFX12: v_cmpx_nge_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nge_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nge_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nge_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v1.h, v255.h +// GFX12: v_cmpx_ngt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8b,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ngt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v1.l, v255.l +// GFX12: v_cmpx_ngt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_ngt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v255.h, v2.h +// GFX12: v_cmpx_ngt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8b,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ngt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_ngt_f16 v255.l, v2.l +// GFX12: v_cmpx_ngt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_ngt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_ngt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_ngt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v1.h, v255.h +// GFX12: v_cmpx_nle_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8c,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nle_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v1.l, v255.l +// GFX12: v_cmpx_nle_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00] + +v_cmpx_nle_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] + +v_cmpx_nle_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v255.h, v2.h +// GFX12: v_cmpx_nle_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8c,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nle_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nle_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nle_f16 v255.l, v2.l +// GFX12: v_cmpx_nle_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00] + +v_cmpx_nle_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] + +v_cmpx_nle_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nle_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] + +v_cmpx_nlg_f16 v1.h, v255.h +// GFX12: v_cmpx_nlg_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8a,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_neq_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_neq_f16 v255, v2 -// GFX12: v_cmpx_neq_f16_e64 v255, v2 ; encoding: 
[0x7e,0x00,0x8d,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlg_f16 v1.l, v255.l +// GFX12: v_cmpx_nlg_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_neq_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlg_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_neq_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_neq_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nge_f16 v1, v255 -// GFX12: v_cmpx_nge_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_nlg_f16 v255.h, v2.h +// GFX12: v_cmpx_nlg_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8a,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nge_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlg_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nge_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nge_f16 v255, v2 -// GFX12: v_cmpx_nge_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlg_f16 v255.l, v2.l +// GFX12: v_cmpx_nlg_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nge_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlg_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nge_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nge_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlg_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlg_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_ngt_f16 v1, v255 -// GFX12: v_cmpx_ngt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_nlt_f16 v1.h, v255.h +// GFX12: v_cmpx_nlt_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x8e,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ngt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] 
+v_cmpx_nlt_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_ngt_f16 v255, v2 -// GFX12: v_cmpx_ngt_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlt_f16 v1.l, v255.l +// GFX12: v_cmpx_nlt_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_ngt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlt_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_ngt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_ngt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nle_f16 v1, v255 -// GFX12: v_cmpx_nle_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_nlt_f16 v255.h, v2.h +// GFX12: v_cmpx_nlt_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x8e,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nle_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_nlt_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nle_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nle_f16 v255, v2 -// GFX12: v_cmpx_nle_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_nlt_f16 v255.l, v2.l +// GFX12: v_cmpx_nlt_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nle_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_nlt_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nle_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nle_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_nlt_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_nlt_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nlg_f16 v1, v255 -// GFX12: v_cmpx_nlg_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_o_f16 v1.h, v255.h +// GFX12: v_cmpx_o_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x87,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_nlg_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlg_f16 v255, v2 -// GFX12: v_cmpx_nlg_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_o_f16 v1.l, v255.l +// GFX12: v_cmpx_o_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_nlg_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_o_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_nlg_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlg_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_o_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_nlt_f16 v1, v255 -// GFX12: v_cmpx_nlt_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_o_f16 v255.h, v2.h +// GFX12: v_cmpx_o_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x87,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nlt_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_o_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nlt_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_o_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_nlt_f16 v255, v2 -// GFX12: v_cmpx_nlt_f16_e64 v255, v2 ; encoding: 
[0x7e,0x00,0x8e,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_o_f16 v255.l, v2.l +// GFX12: v_cmpx_o_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_nlt_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_o_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_nlt_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_nlt_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_o_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_o_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_o_f16 v1, v255 -// GFX12: v_cmpx_o_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v1.h, v255.h +// GFX12: v_cmpx_u_f16_e64 v1.h, v255.h ; encoding: [0x7e,0x18,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_o_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v1.h, v255.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_o_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.h, v255.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.h, v255.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_o_f16 v255, v2 -// GFX12: v_cmpx_o_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v1.l, v255.l +// GFX12: v_cmpx_u_f16_e64 v1.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] -v_cmpx_o_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v255.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] -v_cmpx_o_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_o_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v1.l, v255.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v1.l, v255.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] -v_cmpx_u_f16 v1, v255 -// GFX12: v_cmpx_u_f16_e64 v1, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0xff,0x03,0x00] +v_cmpx_u_f16 v255.h, v2.h +// GFX12: v_cmpx_u_f16_e64 v255.h, v2.h ; encoding: [0x7e,0x18,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v1, v255 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v255 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0xfe,0x03,0x00,0x01,0x77,0x39,0x05] +v_cmpx_u_f16 v255.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp 
v255.h, v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x18,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v1, v255 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v1, v255 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0xfe,0x03,0x00,0x01,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.h, v2.h quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v255.h, v2.h op_sel:[1,1] quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x18,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] -v_cmpx_u_f16 v255, v2 -// GFX12: v_cmpx_u_f16_e64 v255, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] +v_cmpx_u_f16 v255.l, v2.l +// GFX12: v_cmpx_u_f16_e64 v255.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0x05,0x02,0x00] -v_cmpx_u_f16 v255, v2 dpp8:[7,6,5,4,3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v255, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] +v_cmpx_u_f16 v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v255.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0xff,0x77,0x39,0x05] -v_cmpx_u_f16 v255, v2 quad_perm:[3,2,1,0] -// GFX12: v_cmpx_u_f16_e64_dpp v255, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] +v_cmpx_u_f16 v255.l, v2.l quad_perm:[3,2,1,0] +// GFX12: v_cmpx_u_f16_e64_dpp v255.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0xff,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt index b73c7f83c7442..4a46eaead2390 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16_from_vopcx.txt @@ -445,46 +445,72 @@ # GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x80,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x80,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x80,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, 
-|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_f_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x90,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -613,46 +639,72 @@ # GFX11: v_cmpx_f_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# 
GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 
row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -917,46 +969,72 @@ # GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 
0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + 
+0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1221,46 +1299,72 @@ # GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -1525,46 +1629,72 @@
# GFX11: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2159,46 +2289,72 @@
# GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2243,46 +2399,72 @@
# GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2327,46 +2509,72 @@
# GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2411,46 +2619,72 @@
# GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2495,46 +2729,72 @@
# GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2579,46 +2839,72 @@
# GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+
+0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+
+0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
+# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
# GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -2663,46 +2949,72 @@
# GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+#
GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2747,46 +3059,72 @@ # GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 
row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8f,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8f,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_t_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9f,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2915,46 +3253,72 @@ # GFX11: v_cmpx_t_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, 
v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 
0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt index 0b7e14108848c..4c197faf52c16 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8_from_vopcx.txt @@ -123,16 +123,32 @@ # GFX11: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x80,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x80,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x80,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x80,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_f_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x90,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -159,16 +175,32 @@ # GFX11: v_cmpx_f_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xc8,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -235,16 +267,32 @@ # GFX11: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -311,16 +359,32 @@ # GFX11: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -387,16 +451,32 @@ # GFX11: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -555,16 +635,32 @@ # GFX11: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -579,16 +675,32 @@ # GFX11: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -603,16 +715,32 @@ # GFX11: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: 
v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -627,16 +755,32 @@ # GFX11: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 
+# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -651,16 +795,32 @@ # GFX11: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -675,16 +835,32 @@ # GFX11: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_nlt_f16_e64_dpp -v1, |v2| 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -699,16 +875,32 @@ # GFX11: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_o_f16_e64_dpp -v1, 
|v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -723,16 +915,32 @@ # GFX11: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_t_f16_e64_dpp -v1, |v2| 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8f,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8f,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8f,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_t_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9f,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -759,16 +967,32 @@ # GFX11: v_cmpx_t_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcf,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt index cd897944845a0..40c34708d863e 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_from_vopcx.txt @@ -568,10 +568,12 @@ # GFX11: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_f_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x04,0x00,0x00] @@ -612,6 +614,14 @@ 
0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_f_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x80,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x80,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x80,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x80,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x80,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_f_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x90,0xd4,0x01,0x05,0x02,0x00] @@ -856,10 +866,12 @@ # GFX11: v_cmpx_f_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xd8,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00] @@ -900,6 +912,14 @@ 0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00] @@ -1254,10 +1274,12 @@ # GFX11: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] @@ -1298,6 
+1320,14 @@ 0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00] @@ -1652,10 +1682,12 @@ # GFX11: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] @@ -1696,6 +1728,14 @@ 0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00] @@ -2050,10 +2090,12 @@ # GFX11: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_lg_f16_e64 s1, s2 ; encoding: 
[0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] @@ -2094,6 +2136,14 @@ 0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00] @@ -2856,10 +2906,12 @@ # GFX11: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] @@ -2900,6 +2952,14 @@ 0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00] @@ -2982,10 +3042,12 @@ # GFX11: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00 
# GFX11: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] @@ -3026,6 +3088,14 @@ 0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00] @@ -3108,10 +3178,12 @@ # GFX11: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] @@ -3152,6 +3224,14 @@ 0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00] @@ -3234,10 +3314,12 @@ # GFX11: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: 
[0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] @@ -3278,6 +3360,14 @@ 0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00] @@ -3360,10 +3450,12 @@ # GFX11: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] @@ -3404,6 +3496,14 @@ 0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00] @@ -3486,10 +3586,12 @@ # GFX11: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: 
[0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] @@ -3530,6 +3632,14 @@ 0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00] @@ -3612,10 +3722,12 @@ # GFX11: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] @@ -3656,6 +3768,14 @@ 0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00] @@ -3738,10 +3858,12 @@ # GFX11: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e64 
v255.l, v255.l ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_t_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x04,0x00,0x00] @@ -3782,6 +3904,14 @@ 0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_t_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8f,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8f,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8f,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8f,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8f,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_t_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9f,0xd4,0x01,0x05,0x02,0x00] @@ -4026,10 +4156,12 @@ # GFX11: v_cmpx_t_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdf,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf] 0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00 -# GFX11: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] 0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00 -# GFX11: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] 0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00 # GFX11: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] @@ -4070,6 +4202,14 @@ 0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 # GFX11: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] +0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] + 0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00 # GFX11: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt index 90a95138144f1..3d16d19954ab9 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx.txt @@ -649,49 +649,84 @@ # GFX11: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x00,0x7d] 0x7f,0x05,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x00,0x7d] 0x01,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x00,0x7d] 0x69,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x00,0x7d] 0x6a,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x00,0x7d] 0x6b,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x00,0x7d] 0x7b,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x00,0x7d] 0x7d,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x00,0x7d] 0x7e,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x00,0x7d] 0x7f,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x00,0x7d] 0x7c,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x00,0x7d] 0xc1,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x00,0x7d] 0xf0,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x00,0x7d] 0xfd,0x04,0x00,0x7d -# GFX11: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d] +# GFX11-REAL16: v_cmpx_f_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x00,0x7d] 0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_f_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0x00,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x00,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x00,0x7d] + +0xff,0x05,0x00,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x00,0x7d] + +0xf0,0xfe,0x00,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x00,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x00,0x7d] + +0xfd,0x04,0x01,0x7d +# GFX11-REAL16: v_cmpx_f_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x01,0x7d] +# GFX11-FAKE16: v_cmpx_f_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x01,0x7d] + +0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_f_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x01,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x20,0x7d # GFX11: v_cmpx_f_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x20,0x7d] @@ -937,49 +972,84 @@ # GFX11: v_cmpx_f_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb1,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] 0x7f,0x05,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] 0x01,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] 0x69,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] 0x6a,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] 0x6b,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] 0x7b,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] 0x7d,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] 0x7e,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] 0x7f,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] 0x7c,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] 0xc1,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] 0xf0,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] 0xfd,0x04,0x0c,0x7d -# GFX11: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX11-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] 0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0c,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0c,0x7d] + +0xff,0x05,0x0c,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0c,0x7d] + +0xf0,0xfe,0x0c,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0c,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x0c,0x7d] + +0xfd,0x04,0x0d,0x7d +# GFX11-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0d,0x7d] + +0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2c,0x7d # GFX11: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -1385,49 +1455,84 @@ # GFX11: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] 0x7f,0x05,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 v127, v2 ; encoding: 
[0x7f,0x05,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] 0x01,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] 0x69,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] 0x6a,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] 0x6b,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] 0x7b,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] 0x7d,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] 0x7e,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] 0x7f,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] 0x7c,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] 0xc1,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] 0xf0,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] 0xfd,0x04,0x08,0x7d -# GFX11: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX11-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] 0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] + 
+0x81,0x05,0x08,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x08,0x7d] + +0xff,0x05,0x08,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x08,0x7d] + +0xf0,0xfe,0x08,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x08,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x08,0x7d] + +0xfd,0x04,0x09,0x7d +# GFX11-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x09,0x7d] + +0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x28,0x7d # GFX11: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1833,49 +1938,84 @@ # GFX11: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] 0x7f,0x05,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] 0x01,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] 0x69,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] 0x6a,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] 0x6b,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] 0x7b,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] 0x7d,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] 0x7e,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +# 
GFX11-REAL16: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] 0x7f,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] 0x7c,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] 0xc1,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] 0xf0,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] 0xfd,0x04,0x06,0x7d -# GFX11: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX11-REAL16: v_cmpx_le_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] 0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x06,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x06,0x7d] + +0xff,0x05,0x06,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x06,0x7d] + +0xf0,0xfe,0x06,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x06,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x06,0x7d] + +0xfd,0x04,0x07,0x7d +# GFX11-REAL16: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] +# GFX11-FAKE16: v_cmpx_le_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x07,0x7d] + +0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x26,0x7d # GFX11: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -2281,49 +2421,84 @@ # GFX11: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] 0x7f,0x05,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +# 
GFX11-REAL16: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] 0x01,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] 0x69,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] 0x6a,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] 0x6b,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] 0x7b,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] 0x7d,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] 0x7e,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] 0x7f,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] 0x7c,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] 0xc1,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] 0xf0,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] 0xfd,0x04,0x0a,0x7d -# GFX11: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX11-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] 0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0a,0x7d +# GFX11-REAL16: 
v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x7d] + +0xff,0x05,0x0a,0x7d +# GFX11-REAL16: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x7d] + +0xf0,0xfe,0x0a,0x7d +# GFX11-REAL16: v_cmpx_lg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0a,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x0a,0x7d] + +0xfd,0x04,0x0b,0x7d +# GFX11-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0b,0x7d] + +0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2a,0x7d # GFX11: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -3212,49 +3387,84 @@ # GFX11: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] 0x7f,0x05,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] 0x01,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] 0x69,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] 0x6a,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] 0x6b,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] 0x7b,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] 0x7d,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] 0x7e,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX11-REAL16: 
v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] 0x7f,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] 0x7c,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] 0xc1,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] 0xf0,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] 0xfd,0x04,0x1a,0x7d -# GFX11: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +# GFX11-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] 0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1a,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1a,0x7d] + +0xff,0x05,0x1a,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1a,0x7d] + +0xf0,0xfe,0x1a,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1a,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x1a,0x7d] + +0xfd,0x04,0x1b,0x7d +# GFX11-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1b,0x7d] + +0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3a,0x7d # GFX11: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -3338,49 +3548,84 @@ # GFX11: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] 0x7f,0x05,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 v127, v2 ; encoding: 
[0x7f,0x05,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] 0x01,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] 0x69,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] 0x6a,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] 0x6b,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] 0x7b,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] 0x7d,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] 0x7e,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] 0x7f,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] 0x7c,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] 0xc1,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] 0xf0,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] 0xfd,0x04,0x12,0x7d -# GFX11: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX11-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] 0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: 
[0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x12,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x12,0x7d] + +0xff,0x05,0x12,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x12,0x7d] + +0xf0,0xfe,0x12,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x12,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x12,0x7d] + +0xfd,0x04,0x13,0x7d +# GFX11-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x13,0x7d] + +0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x32,0x7d # GFX11: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -3464,49 +3709,84 @@ # GFX11: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] 0x7f,0x05,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] 0x01,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] 0x69,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] 0x6a,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] 0x6b,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] 0x7b,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] 0x7d,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] 0x7e,0x04,0x16,0x7d -# 
GFX11: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] 0x7f,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] 0x7c,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] 0xc1,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] 0xf0,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] 0xfd,0x04,0x16,0x7d -# GFX11: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] 0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x16,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x16,0x7d] + +0xff,0x05,0x16,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x16,0x7d] + +0xf0,0xfe,0x16,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x16,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x16,0x7d] + +0xfd,0x04,0x17,0x7d +# GFX11-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x17,0x7d] + +0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x36,0x7d # GFX11: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -3590,49 +3870,84 @@ # GFX11: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v1, v2 ; encoding: 
[0x01,0x05,0x18,0x7d] 0x7f,0x05,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] 0x01,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] 0x69,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] 0x6a,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] 0x6b,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] 0x7b,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] 0x7d,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] 0x7e,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] 0x7f,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] 0x7c,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] 0xc1,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] 0xf0,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] 0xfd,0x04,0x18,0x7d -# GFX11: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX11-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] 0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: 
[0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x18,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x18,0x7d] + +0xff,0x05,0x18,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x18,0x7d] + +0xf0,0xfe,0x18,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x18,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x18,0x7d] + +0xfd,0x04,0x19,0x7d +# GFX11-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x19,0x7d] + +0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x38,0x7d # GFX11: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3716,49 +4031,84 @@ # GFX11: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] 0x7f,0x05,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] 0x01,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] 0x69,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] 0x6a,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] 0x6b,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] 0x7b,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] 0x7d,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] 
+# GFX11-FAKE16: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] 0x7e,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] 0x7f,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] 0x7c,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] 0xc1,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] 0xf0,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] 0xfd,0x04,0x14,0x7d -# GFX11: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] 0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x14,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x14,0x7d] + +0xff,0x05,0x14,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x14,0x7d] + +0xf0,0xfe,0x14,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x14,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x14,0x7d] + +0xfd,0x04,0x15,0x7d +# GFX11-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x15,0x7d] + +0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x34,0x7d # GFX11: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3842,49 +4192,84 @@ # GFX11: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v1.l, v2.l ; 
encoding: [0x01,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] 0x7f,0x05,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] 0x01,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] 0x69,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] 0x6a,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] 0x6b,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] 0x7b,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] 0x7d,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] 0x7e,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] 0x7f,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] 0x7c,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] 0xc1,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] 0xf0,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] 0xfd,0x04,0x1c,0x7d -# GFX11: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX11-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] 0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# 
GFX11-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1c,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1c,0x7d] + +0xff,0x05,0x1c,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1c,0x7d] + +0xf0,0xfe,0x1c,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1c,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x1c,0x7d] + +0xfd,0x04,0x1d,0x7d +# GFX11-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1d,0x7d] + +0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3c,0x7d # GFX11: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3968,49 +4353,84 @@ # GFX11: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] 0x7f,0x05,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] 0x01,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] 0x69,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] 0x6a,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] 0x6b,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] 0x7b,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] 0x7d,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 m0, v2.l ; encoding: 
[0x7d,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] 0x7e,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] 0x7f,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] 0x7c,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] 0xc1,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] 0xf0,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] 0xfd,0x04,0x0e,0x7d -# GFX11: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX11-REAL16: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] 0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0e,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0e,0x7d] + +0xff,0x05,0x0e,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0e,0x7d] + +0xf0,0xfe,0x0e,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x0e,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x0e,0x7d] + +0xfd,0x04,0x0f,0x7d +# GFX11-REAL16: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] +# GFX11-FAKE16: v_cmpx_o_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0f,0x7d] + +0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2e,0x7d # GFX11: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -4094,49 +4514,84 @@ # GFX11: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1e,0x7d] +# GFX11-FAKE16: 
v_cmpx_t_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1e,0x7d] 0x7f,0x05,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1e,0x7d] 0x01,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1e,0x7d] 0x69,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1e,0x7d] 0x6a,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1e,0x7d] 0x6b,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1e,0x7d] 0x7b,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1e,0x7d] 0x7d,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1e,0x7d] 0x7e,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1e,0x7d] 0x7f,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1e,0x7d] 0x7c,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1e,0x7d] 0xc1,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1e,0x7d] 0xf0,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1e,0x7d] 0xfd,0x04,0x1e,0x7d -# GFX11: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7d] +# GFX11-REAL16: v_cmpx_t_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1e,0x7d] 0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_t_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_t_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e32 
0xfe0b, v127 ; encoding: [0xff,0xfe,0x1e,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1e,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1e,0x7d] + +0xff,0x05,0x1e,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1e,0x7d] + +0xf0,0xfe,0x1e,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x1e,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x1e,0x7d] + +0xfd,0x04,0x1f,0x7d +# GFX11-REAL16: v_cmpx_t_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1f,0x7d] +# GFX11-FAKE16: v_cmpx_t_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1f,0x7d] + +0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_t_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_t_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1f,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3e,0x7d # GFX11: v_cmpx_t_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3e,0x7d] @@ -4382,49 +4837,84 @@ # GFX11: v_cmpx_t_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbf,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] 0x7f,0x05,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] 0x01,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] 0x69,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] 0x6a,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] 0x6b,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] 0x7b,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] 0x7d,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] 0x7e,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] 0x7f,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] 0x7c,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] 0xc1,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] 0xf0,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] 0xfd,0x04,0x10,0x7d -# GFX11: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX11-REAL16: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] 0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00 -# GFX11: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x10,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x10,0x7d] + +0xff,0x05,0x10,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x10,0x7d] + +0xf0,0xfe,0x10,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 0.5, v127.l ; encoding: [0xf0,0xfe,0x10,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0.5, v127 ; encoding: [0xf0,0xfe,0x10,0x7d] + +0xfd,0x04,0x11,0x7d +# GFX11-REAL16: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d] +# GFX11-FAKE16: v_cmpx_u_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x11,0x7d] + +0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00 +# GFX11-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] +# GFX11-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x30,0x7d # GFX11: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt index 9b9b423a7b104..f55e646dda79b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp16.txt @@ -445,46 +445,72 @@ # GFX11: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] 
0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x00,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_f_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_f_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_f_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_f_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x00,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x01,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_f_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x01,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_f_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x01,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_f_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_f_f16 
-|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x01,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_f_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x20,0x7d,0x01,0x1b,0x00,0xff] @@ -613,46 +639,72 @@ # GFX11: v_cmpx_f_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x91,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 
row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ge_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13 +# 
GFX11-REAL16: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] @@ -917,46 +969,72 @@ # GFX11: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_gt_f16 -|v127.l|, -|v127.l| 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_gt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_gt_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_gt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff] @@ -1221,46 +1299,72 @@ # GFX11: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 
row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13 
-# GFX11: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_le_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_le_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_le_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff] @@ -1525,46 +1629,72 @@ # GFX11: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_lg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_lg_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_lg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] @@ -2159,46 +2289,72 @@ # GFX11: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] 
0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_neq_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_neq_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_neq_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_neq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff # 
GFX11: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] @@ -2243,46 +2399,72 @@ # GFX11: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# 
GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nge_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nge_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, 
v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff] @@ -2327,46 +2509,72 @@ # GFX11: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 
v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_ngt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_ngt_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_ngt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff] @@ -2411,46 +2619,72 @@ # GFX11: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, 
v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: 
v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nle_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nle_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nle_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff] @@ -2495,46 +2729,72 @@ # GFX11: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nlg_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlg_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff] @@ -2579,46 +2839,72 @@ # GFX11: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] 
0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_nlt_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_nlt_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_nlt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff] @@ -2663,46 +2949,72 @@ # GFX11: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_o_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_o_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_o_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff] @@ -2747,46 +3059,72 @@ # GFX11: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_t_f16 
v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_t_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_t_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_t_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; 
encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_t_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x1e,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x1f,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_t_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_t_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1f,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_t_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_t_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_t_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3e,0x7d,0x01,0x1b,0x00,0xff] @@ -2915,46 +3253,72 @@ # GFX11: v_cmpx_t_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9f,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff -# GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff -# GFX11: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 
row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff -# GFX11: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01 -# GFX11: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13 -# GFX11: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX11: v_cmpx_u_f16 
-|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-REAL16: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01 +# GFX11-REAL16: v_cmpx_u_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_cmpx_u_f16 v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x5f,0x01,0x01] + +0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13 +# GFX11-REAL16: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] +# GFX11-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30 +# GFX11-REAL16: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] +# GFX11-FAKE16: v_cmpx_u_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff # GFX11: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt index 6ca58524688e2..72fb40a7f22a0 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vopcx_dpp8.txt @@ -127,10 +127,30 @@ # GFX11: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_f_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x00,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_f_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x00,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_f_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_f_f16 v127, v127 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x00,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_f_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_f_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_f_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x01,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_f_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_f_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_f_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x01,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_f_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x20,0x7d,0x01,0x77,0x39,0x05] @@ -151,10 +171,30 @@ # GFX11: v_cmpx_f_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x91,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0c,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 
'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00 +# GFX11-REAL16: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_ge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05 # GFX11: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] @@ -227,10 +267,30 @@ # GFX11: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05 -# GFX11: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00 -# GFX11: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX11-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x08,0x7d,0x7f,0x77,0x39,0x05] + +0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05 +# GFX11-REAL16: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX11-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 
'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_gt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_gt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05]
@@ -303,10 +363,30 @@
# GFX11: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x06,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_le_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05]
@@ -379,10 +459,30 @@
# GFX11: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0a,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_lg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05]
@@ -545,10 +645,30 @@
# GFX11: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1a,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_neq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05]
@@ -557,10 +677,30 @@
# GFX11: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x12,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05]
@@ -569,10 +709,30 @@
# GFX11: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x16,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_ngt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05]
@@ -581,10 +741,30 @@
# GFX11: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x18,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nle_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05]
@@ -593,10 +773,30 @@
# GFX11: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x14,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05]
@@ -605,10 +805,30 @@
# GFX11: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1c,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_nlt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05]
@@ -617,10 +837,30 @@
# GFX11: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x0e,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_o_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05]
@@ -629,10 +869,30 @@
# GFX11: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_t_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_t_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1e,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_t_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_t_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1e,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_t_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_t_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_t_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x1e,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_t_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_t_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_t_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1f,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_t_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_t_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_t_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_t_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3e,0x7d,0x01,0x77,0x39,0x05]
@@ -653,10 +913,30 @@
# GFX11: v_cmpx_t_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9f,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05
-# GFX11: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-REAL16: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05]

0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00
-# GFX11: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00]
+
+0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xfe,0x10,0x7d,0x7f,0x77,0x39,0x05]
+
+0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05
+# GFX11-REAL16: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+# GFX11-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-REAL16: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+# GFX12-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05]
+
+0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00
+# GFX11-REAL16: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]
+# GFX11-FAKE16: v_cmpx_u_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-REAL16: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00]

0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05
# GFX11: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
index d3a19ae00fa21..ab2d154e9ef9f 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx.txt
@@ -567,10 +567,12 @@
# GFX12: v_cmpx_eq_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xda,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00]
@@ -611,6 +613,15 @@
0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_ge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x96,0xd4,0x01,0x05,0x02,0x00]
@@ -967,10 +978,12 @@
# GFX12: v_cmpx_ge_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xde,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00]
@@ -1011,6 +1024,15 @@
0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_gt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x94,0xd4,0x01,0x05,0x02,0x00]
@@ -1367,10 +1389,12 @@
# GFX12: v_cmpx_gt_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdc,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_le_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00]
@@ -1411,6 +1435,15 @@
0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_le_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x93,0xd4,0x01,0x05,0x02,0x00]
@@ -1767,10 +1800,12 @@
# GFX12: v_cmpx_le_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdb,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00]
@@ -1811,6 +1846,15 @@
0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_lg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x95,0xd4,0x01,0x05,0x02,0x00]
@@ -2576,10 +2620,12 @@
# GFX12: v_cmpx_ne_u64_e64 0xaf123456, vcc ; encoding: [0x7e,0x00,0xdd,0xd4,0xff,0xd4,0x00,0x00,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00]
@@ -2620,6 +2666,15 @@
0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_neq_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9d,0xd4,0x01,0x05,0x02,0x00]
@@ -2702,10 +2757,12 @@
# GFX12: v_cmpx_neq_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xad,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00]
@@ -2746,6 +2803,15 @@
0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nge_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x99,0xd4,0x01,0x05,0x02,0x00]
@@ -2828,10 +2894,12 @@
# GFX12: v_cmpx_nge_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa9,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00]
@@ -2872,6 +2940,15 @@
0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_ngt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9b,0xd4,0x01,0x05,0x02,0x00]
@@ -2954,10 +3031,12 @@
# GFX12: v_cmpx_ngt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xab,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00]
@@ -2998,6 +3077,15 @@
0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nle_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9c,0xd4,0x01,0x05,0x02,0x00]
@@ -3080,10 +3168,12 @@
# GFX12: v_cmpx_nle_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xac,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00]
@@ -3124,6 +3214,15 @@
0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nlg_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9a,0xd4,0x01,0x05,0x02,0x00]
@@ -3206,10 +3305,12 @@
# GFX12: v_cmpx_nlg_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xaa,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00]
@@ -3250,6 +3351,15 @@
0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_nlt_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x9e,0xd4,0x01,0x05,0x02,0x00]
@@ -3332,10 +3442,12 @@
# GFX12: v_cmpx_nlt_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xae,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_o_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00]
@@ -3376,6 +3488,15 @@
0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00]
+
+
0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_o_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x97,0xd4,0x01,0x05,0x02,0x00]
@@ -3458,10 +3579,12 @@
# GFX12: v_cmpx_o_f64_e64 0xaf123456, -|vcc| clamp ; encoding: [0x7e,0x82,0xa7,0xd4,0xff,0xd4,0x00,0x40,0x56,0x34,0x12,0xaf]

0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00
-# GFX12: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-REAL16: v_cmpx_u_f16_e64 v1.l, v2.l ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]

0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00
-# GFX12: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.l ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]

0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00
# GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00]
@@ -3502,6 +3625,14 @@
0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00
# GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00]

+0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00]
+
0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00
# GFX12: v_cmpx_u_f32_e64 v1, v2 ; encoding: [0x7e,0x00,0x98,0xd4,0x01,0x05,0x02,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
index a1061a067d73c..f8ce4fafc0252 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp16.txt
@@ -666,49 +666,123 @@
# GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff]

0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
-# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x86,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]

0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01
-# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01]

0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13
-# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]

0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30
-# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
+
+0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x86,0xd4,0x01,0x05,0x02,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x05,0x02,0x00]
+
+0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00
+# GFX12-REAL16: v_cmpx_ge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x86,0xd4,0xff,0xff,0x03,0x00]
+# GFX12-FAKE16: v_cmpx_ge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x86,0xd4,0xff,0xff,0x03,0x00]
+
+0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00
+# GFX12: v_cmpx_ge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x86,0xd4,0x01,0x04,0x00,0x00]
+
+0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00
+# GFX12:
v_cmpx_ge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x86,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x86,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x86,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x86,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x86,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x86,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x86,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x86,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x86,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_ge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x86,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_ge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x86,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x86,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x86,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x86,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x96,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1084,49 +1158,123 @@ # GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x84,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x84,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x84,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x84,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x84,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x84,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x84,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x84,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_gt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x84,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_gt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x84,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_gt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x84,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x84,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x84,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x84,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_gt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x84,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_gt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x84,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_gt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x84,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# 
GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x84,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x84,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x94,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1502,49 +1650,123 @@ # GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x83,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x83,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x83,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x83,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x83,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x83,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x83,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 vcc_hi, 0xfe0b ; encoding: 
[0x7e,0x00,0x83,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_le_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x83,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_le_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x83,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_le_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x83,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x83,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x83,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x83,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_le_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x83,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_le_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x83,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_le_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x83,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x83,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x83,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x93,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -1920,49 +2142,123 @@ # GFX12: v_cmpx_le_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l 
row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x85,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: 
v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x85,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x85,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x85,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x85,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x85,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x85,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x85,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x85,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x85,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_lg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x85,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x85,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x85,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x85,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_lg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x85,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_lg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x85,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_lg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x85,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x85,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; 
encoding: [0x7e,0x0a,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x85,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x95,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2830,49 +3126,123 @@ # GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: 
v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 
0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8d,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8d,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8d,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8d,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8d,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8d,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8d,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_neq_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8d,0xd4,0x7b,0xfa,0x01,0x00] + 
+0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_neq_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_neq_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8d,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8d,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8d,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8d,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_neq_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8d,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_neq_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8d,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_neq_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8d,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8d,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8d,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9d,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -2920,49 +3290,123 @@ # GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp 
v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x89,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x89,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x89,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x89,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x89,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x89,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x89,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x89,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nge_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x89,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nge_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x89,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_nge_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x89,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x89,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x89,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x89,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nge_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x89,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nge_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x89,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nge_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x89,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x89,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: 
v_cmpx_nge_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x89,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x99,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3010,49 +3454,123 @@ # GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf 
bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8b,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8b,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8b,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8b,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8b,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8b,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8b,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_ngt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8b,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_ngt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7d,0xe0,0x01,0x00] + 
+0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_ngt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8b,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8b,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8b,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8b,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_ngt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8b,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_ngt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8b,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_ngt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8b,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8b,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8b,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9b,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3100,49 +3618,123 @@ # GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: 
v_cmpx_nle_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8c,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8c,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8c,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8c,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8c,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8c,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8c,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nle_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8c,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nle_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_nle_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8c,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8c,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8c,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8c,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nle_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8c,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nle_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8c,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nle_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8c,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8c,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x02,0x8c,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9c,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3190,49 +3782,123 @@ # GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# 
GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8a,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8a,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8a,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8a,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8a,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8a,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8a,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nlg_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8a,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nlg_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00 
+# GFX12: v_cmpx_nlg_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8a,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8a,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8a,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8a,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nlg_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8a,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nlg_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8a,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlg_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8a,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8a,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8a,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9a,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3280,49 +3946,123 @@ # GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l 
quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 
0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x8e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x8e,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x8e,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x8e,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x8e,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x8e,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x8e,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x8e,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_nlt_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x8e,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_nlt_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_nlt_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x8e,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x8e,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x8e,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x8e,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_nlt_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x8e,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_nlt_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x8e,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_nlt_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x8e,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x8e,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0x7e,0x02,0x8e,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x9e,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3370,49 +4110,123 @@ # GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp 
v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, v2.l row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x87,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + +0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x87,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x87,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x87,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x87,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x87,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x87,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x87,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x87,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x87,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_o_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x87,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 
|exec_hi|, null ; encoding: [0x7e,0x01,0x87,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x87,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x87,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_o_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x87,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_o_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x87,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_o_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x87,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x87,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x87,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_o_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x97,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -3460,49 +4274,123 @@ # GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, s3 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x06,0x00,0x00,0x01,0x2f,0x01,0xff] 0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x88,0xd4,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] 0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] 0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] + 
+0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00 +# GFX12-REAL16: v_cmpx_u_f16_e64 v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x7e,0x08,0x88,0xd4,0x01,0x05,0x02,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64 v1, v2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x05,0x02,0x00] + +0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00 +# GFX12-REAL16: v_cmpx_u_f16_e64 v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x7e,0x10,0x88,0xd4,0xff,0xff,0x03,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64 v255, v255 ; encoding: [0x7e,0x00,0x88,0xd4,0xff,0xff,0x03,0x00] + +0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 s1, s2 ; encoding: [0x7e,0x00,0x88,0xd4,0x01,0x04,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 s105, s105 ; encoding: [0x7e,0x00,0x88,0xd4,0x69,0xd2,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 vcc_lo, ttmp15 ; encoding: [0x7e,0x00,0x88,0xd4,0x6a,0xf6,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 vcc_hi, 0xfe0b ; encoding: [0x7e,0x00,0x88,0xd4,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 ttmp15, src_scc ; encoding: [0x7e,0x00,0x88,0xd4,0x7b,0xfa,0x01,0x00] + +0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 m0, 0.5 ; encoding: [0x7e,0x00,0x88,0xd4,0x7d,0xe0,0x01,0x00] + +0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00 +# GFX12: v_cmpx_u_f16_e64 exec_lo, -1 ; encoding: [0x7e,0x00,0x88,0xd4,0x7e,0x82,0x01,0x00] + +0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 |exec_hi|, null ; encoding: [0x7e,0x01,0x88,0xd4,0x7f,0xf8,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 null, exec_lo ; encoding: [0x7e,0x00,0x88,0xd4,0x7c,0xfc,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 -1, exec_hi ; encoding: [0x7e,0x00,0x88,0xd4,0xc1,0xfe,0x00,0x00] + +0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40 +# GFX12: v_cmpx_u_f16_e64 0.5, -m0 ; encoding: [0x7e,0x00,0x88,0xd4,0xf0,0xfa,0x00,0x40] + +0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20 +# GFX12: v_cmpx_u_f16_e64 -src_scc, |vcc_lo| ; encoding: [0x7e,0x02,0x88,0xd4,0xfd,0xd4,0x00,0x20] + +0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00 +# GFX12: v_cmpx_u_f16_e64 -|0xfe0b|, -|vcc_hi| clamp ; encoding: [0x7e,0x83,0x88,0xd4,0xff,0xd6,0x00,0x60,0x0b,0xfe,0x00,0x00] + +0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x19,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x7e,0x01,0x88,0xd4,0xfa,0x04,0x02,0x40,0x01,0x5f,0x01,0x01] + +0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x0a,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x7e,0x02,0x88,0xd4,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] + +0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp row_xmask:15 row_mask:0x3 
bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_u_f32_e64_dpp v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x7e,0x00,0x98,0xd4,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt index 56e1ea1194a5c..83d9623c5458f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3cx_dpp8.txt @@ -155,22 +155,41 @@ # GFX12: v_cmpx_eq_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xca,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x86,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] 
fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x86,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x86,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x86,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x96,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -254,19 +273,37 @@ # GFX12: v_cmpx_ge_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xce,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x84,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x84,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x84,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x84,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x94,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -350,19 +387,37 @@ # GFX12: v_cmpx_gt_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcc,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x83,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x83,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x83,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x83,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x93,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -446,19 +501,37 @@ # GFX12: v_cmpx_le_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcb,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x85,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x85,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x85,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x85,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x95,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -658,19 +731,37 @@ # GFX12: v_cmpx_ne_u32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x00,0xcd,0xd4,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8d,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8d,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8d,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_neq_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x00,0x9d,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -688,22 +779,41 @@ # GFX12: v_cmpx_neq_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9d,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, 2.0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0xe8,0x01,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x89,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] 
; encoding: [0x7e,0x01,0x89,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x89,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x89,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nge_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x99,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -721,19 +831,37 @@ # GFX12: v_cmpx_nge_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x99,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8b,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8b,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8b,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ngt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9b,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -751,19 +879,37 @@ # GFX12: v_cmpx_ngt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9b,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8c,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8c,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8c,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nle_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9c,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -781,19 +927,37 @@ # GFX12: v_cmpx_nle_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9c,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8a,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8a,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8a,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlg_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9a,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -811,19 +975,37 @@ # GFX12: v_cmpx_nlg_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9a,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp v1.l, s3 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x8e,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x8e,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x8e,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x8e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlt_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x9e,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -841,19 +1023,37 @@ # GFX12: v_cmpx_nlt_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x9e,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: 
v_cmpx_o_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x87,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x87,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x87,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x87,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + 0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_o_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; 
encoding: [0x7e,0x00,0x97,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -871,19 +1071,36 @@ # GFX12: v_cmpx_o_f32_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x97,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp v1.l, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp v1, s3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x88,0xd4,0xe9,0x06,0x00,0x00,0x01,0x77,0x39,0x05] 0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.l|, -v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] 0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.l, |v2.l| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] 0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.l| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp |v1.h|, -v2.h op_sel:[1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x19,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x01,0x88,0xd4,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_u_f16_e64_dpp -v1.h, |v2.l| op_sel:[1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x0a,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x02,0x88,0xd4,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 +# 
GFX12-REAL16: v_cmpx_u_f16_e64_dpp -|v255.l|, -|v255.h| op_sel:[0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x93,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e64_dpp -|v255|, -|v255| clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0x7e,0x83,0x88,0xd4,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_u_f32_e64_dpp v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x7e,0x00,0x98,0xd4,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt index b458e613d2f20..373cc6b6e5878 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx.txt @@ -633,49 +633,80 @@ # GFX12: v_cmpx_eq_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb5,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0c,0x7d] 0x7f,0x05,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0c,0x7d] 0x01,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0c,0x7d] 0x69,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0c,0x7d] 0x6a,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0c,0x7d] 0x6b,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0c,0x7d] 0x7b,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0c,0x7d] 0x7d,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0c,0x7d] 0x7e,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0c,0x7d] 0x7f,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0c,0x7d] 0x7c,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] +# 
GFX12-REAL16: v_cmpx_ge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0c,0x7d] 0xc1,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0c,0x7d] 0xf0,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0c,0x7d] 0xfd,0x04,0x0c,0x7d -# GFX12: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX12-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0c,0x7d] 0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0c,0x7d +# GFX12-REAL16: v_cmpx_ge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0c,0x7d] + +0xff,0x05,0x0c,0x7d +# GFX12-REAL16: v_cmpx_ge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0c,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0c,0x7d] + +0xfd,0x04,0x0d,0x7d +# GFX12-REAL16: v_cmpx_ge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0d,0x7d] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0d,0x7d] + +0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2c,0x7d # GFX12: v_cmpx_ge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2c,0x7d] @@ -1073,49 +1104,80 @@ # GFX12: v_cmpx_ge_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbd,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x08,0x7d] 0x7f,0x05,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x08,0x7d] 0x01,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x08,0x7d] 0x69,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x08,0x7d] 0x6a,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 
vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x08,0x7d] 0x6b,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x08,0x7d] 0x7b,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x08,0x7d] 0x7d,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x08,0x7d] 0x7e,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x08,0x7d] 0x7f,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x08,0x7d] 0x7c,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x08,0x7d] 0xc1,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x08,0x7d] 0xf0,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x08,0x7d] 0xfd,0x04,0x08,0x7d -# GFX12: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX12-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x08,0x7d] 0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x08,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x08,0x7d +# GFX12-REAL16: v_cmpx_gt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x08,0x7d] + +0xff,0x05,0x08,0x7d +# GFX12-REAL16: v_cmpx_gt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x08,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x08,0x7d] + +0xfd,0x04,0x09,0x7d +# GFX12-REAL16: v_cmpx_gt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x09,0x7d] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x09,0x7d] 
+ +0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x09,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x28,0x7d # GFX12: v_cmpx_gt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x28,0x7d] @@ -1513,49 +1575,80 @@ # GFX12: v_cmpx_gt_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb9,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x06,0x7d] 0x7f,0x05,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x06,0x7d] 0x01,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x06,0x7d] 0x69,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x06,0x7d] 0x6a,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x06,0x7d] 0x6b,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x06,0x7d] 0x7b,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x06,0x7d] 0x7d,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x06,0x7d] 0x7e,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x06,0x7d] 0x7f,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x06,0x7d] 0x7c,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x06,0x7d] 0xc1,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x06,0x7d] 0xf0,0x04,0x06,0x7d 
-# GFX12: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x06,0x7d] 0xfd,0x04,0x06,0x7d -# GFX12: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX12-REAL16: v_cmpx_le_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x06,0x7d] 0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x06,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x06,0x7d +# GFX12-REAL16: v_cmpx_le_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x06,0x7d] + +0xff,0x05,0x06,0x7d +# GFX12-REAL16: v_cmpx_le_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x06,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x06,0x7d] + +0xfd,0x04,0x07,0x7d +# GFX12-REAL16: v_cmpx_le_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x07,0x7d] +# GFX12-FAKE16: v_cmpx_le_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x07,0x7d] + +0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x07,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x26,0x7d # GFX12: v_cmpx_le_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x26,0x7d] @@ -1953,49 +2046,80 @@ # GFX12: v_cmpx_le_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xb7,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0a,0x7d] 0x7f,0x05,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0a,0x7d] 0x01,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0a,0x7d] 0x69,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0a,0x7d] 0x6a,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0a,0x7d] 0x6b,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 
vcc_hi, v2 ; encoding: [0x6b,0x04,0x0a,0x7d] 0x7b,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0a,0x7d] 0x7d,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0a,0x7d] 0x7e,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0a,0x7d] 0x7f,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0a,0x7d] 0x7c,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0a,0x7d] 0xc1,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0a,0x7d] 0xf0,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0a,0x7d] 0xfd,0x04,0x0a,0x7d -# GFX12: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX12-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0a,0x7d] 0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0a,0x7d +# GFX12-REAL16: v_cmpx_lg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0a,0x7d] + +0xff,0x05,0x0a,0x7d +# GFX12-REAL16: v_cmpx_lg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0a,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0a,0x7d] + +0xfd,0x04,0x0b,0x7d +# GFX12-REAL16: v_cmpx_lg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0b,0x7d] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0b,0x7d] + +0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_lg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2a,0x7d # GFX12: v_cmpx_lg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2a,0x7d] @@ -2864,49 
+2988,80 @@ # GFX12: v_cmpx_ne_u64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0xbb,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1a,0x7d] 0x7f,0x05,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1a,0x7d] 0x01,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1a,0x7d] 0x69,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1a,0x7d] 0x6a,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1a,0x7d] 0x6b,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1a,0x7d] 0x7b,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1a,0x7d] 0x7d,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1a,0x7d] 0x7e,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1a,0x7d] 0x7f,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1a,0x7d] 0x7c,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1a,0x7d] 0xc1,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1a,0x7d] 0xf0,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1a,0x7d] 0xfd,0x04,0x1a,0x7d -# GFX12: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] +# GFX12-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.l ; 
encoding: [0xfd,0x04,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1a,0x7d] 0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1a,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1a,0x7d +# GFX12-REAL16: v_cmpx_neq_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1a,0x7d] + +0xff,0x05,0x1a,0x7d +# GFX12-REAL16: v_cmpx_neq_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1a,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1a,0x7d] + +0xfd,0x04,0x1b,0x7d +# GFX12-REAL16: v_cmpx_neq_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1b,0x7d] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1b,0x7d] + +0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_neq_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1b,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3a,0x7d # GFX12: v_cmpx_neq_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3a,0x7d] @@ -2990,49 +3145,80 @@ # GFX12: v_cmpx_neq_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5b,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x12,0x7d] 0x7f,0x05,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x12,0x7d] 0x01,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x12,0x7d] 0x69,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x12,0x7d] 0x6a,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x12,0x7d] 0x6b,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x12,0x7d] 0x7b,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x12,0x7d] 0x7d,0x04,0x12,0x7d -# GFX12: 
v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x12,0x7d] 0x7e,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x12,0x7d] 0x7f,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x12,0x7d] 0x7c,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x12,0x7d] 0xc1,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x12,0x7d] 0xf0,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x12,0x7d] 0xfd,0x04,0x12,0x7d -# GFX12: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX12-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x12,0x7d] 0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x12,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x12,0x7d +# GFX12-REAL16: v_cmpx_nge_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x12,0x7d] + +0xff,0x05,0x12,0x7d +# GFX12-REAL16: v_cmpx_nge_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x12,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x12,0x7d] + +0xfd,0x04,0x13,0x7d +# GFX12-REAL16: v_cmpx_nge_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x13,0x7d] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x13,0x7d] + +0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nge_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x13,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x32,0x7d # GFX12: v_cmpx_nge_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x32,0x7d] @@ -3116,49 +3302,80 @@ # GFX12: v_cmpx_nge_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x53,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x16,0x7d] +# 
GFX12-FAKE16: v_cmpx_ngt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x16,0x7d] 0x7f,0x05,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x16,0x7d] 0x01,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x16,0x7d] 0x69,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x16,0x7d] 0x6a,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x16,0x7d] 0x6b,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x16,0x7d] 0x7b,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x16,0x7d] 0x7d,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x16,0x7d] 0x7e,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x16,0x7d] 0x7f,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x16,0x7d] 0x7c,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x16,0x7d] 0xc1,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x16,0x7d] 0xf0,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x16,0x7d] 0xfd,0x04,0x16,0x7d -# GFX12: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x16,0x7d] 0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16_e32 
0xfe0b, v127.l ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x16,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x16,0x7d +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x16,0x7d] + +0xff,0x05,0x16,0x7d +# GFX12-REAL16: v_cmpx_ngt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x16,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x16,0x7d] + +0xfd,0x04,0x17,0x7d +# GFX12-REAL16: v_cmpx_ngt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x17,0x7d] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x17,0x7d] + +0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x17,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x36,0x7d # GFX12: v_cmpx_ngt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x36,0x7d] @@ -3242,49 +3459,80 @@ # GFX12: v_cmpx_ngt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x57,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x18,0x7d] 0x7f,0x05,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x18,0x7d] 0x01,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x18,0x7d] 0x69,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x18,0x7d] 0x6a,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x18,0x7d] 0x6b,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x18,0x7d] 0x7b,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x18,0x7d] 0x7d,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x18,0x7d] 0x7e,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: 
[0x7e,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x18,0x7d] 0x7f,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x18,0x7d] 0x7c,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x18,0x7d] 0xc1,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x18,0x7d] 0xf0,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x18,0x7d] 0xfd,0x04,0x18,0x7d -# GFX12: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX12-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x18,0x7d] 0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x18,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x18,0x7d +# GFX12-REAL16: v_cmpx_nle_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x18,0x7d] + +0xff,0x05,0x18,0x7d +# GFX12-REAL16: v_cmpx_nle_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x18,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x18,0x7d] + +0xfd,0x04,0x19,0x7d +# GFX12-REAL16: v_cmpx_nle_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x19,0x7d] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x19,0x7d] + +0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nle_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x19,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x38,0x7d # GFX12: v_cmpx_nle_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x38,0x7d] @@ -3368,49 +3616,80 @@ # GFX12: v_cmpx_nle_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x59,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x14,0x7d] 0x7f,0x05,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 v127, v2 ; 
encoding: [0x7f,0x05,0x14,0x7d] 0x01,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x14,0x7d] 0x69,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x14,0x7d] 0x6a,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x14,0x7d] 0x6b,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x14,0x7d] 0x7b,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x14,0x7d] 0x7d,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x14,0x7d] 0x7e,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x14,0x7d] 0x7f,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x14,0x7d] 0x7c,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x14,0x7d] 0xc1,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x14,0x7d] 0xf0,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x14,0x7d] 0xfd,0x04,0x14,0x7d -# GFX12: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x14,0x7d] 0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x14,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x14,0x7d +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x14,0x7d] +# GFX12-FAKE16: 
v_cmpx_nlg_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x14,0x7d] + +0xff,0x05,0x14,0x7d +# GFX12-REAL16: v_cmpx_nlg_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x14,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x14,0x7d] + +0xfd,0x04,0x15,0x7d +# GFX12-REAL16: v_cmpx_nlg_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x15,0x7d] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x15,0x7d] + +0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x15,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x34,0x7d # GFX12: v_cmpx_nlg_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x34,0x7d] @@ -3494,49 +3773,80 @@ # GFX12: v_cmpx_nlg_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x55,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x1c,0x7d] 0x7f,0x05,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x1c,0x7d] 0x01,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x1c,0x7d] 0x69,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x1c,0x7d] 0x6a,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x1c,0x7d] 0x6b,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x1c,0x7d] 0x7b,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x1c,0x7d] 0x7d,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x1c,0x7d] 0x7e,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x1c,0x7d] 0x7f,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX12-REAL16: 
v_cmpx_nlt_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x1c,0x7d] 0x7c,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x1c,0x7d] 0xc1,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x1c,0x7d] 0xf0,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x1c,0x7d] 0xfd,0x04,0x1c,0x7d -# GFX12: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x1c,0x7d] 0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x1c,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x1c,0x7d +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x1c,0x7d] + +0xff,0x05,0x1c,0x7d +# GFX12-REAL16: v_cmpx_nlt_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x1c,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x1c,0x7d] + +0xfd,0x04,0x1d,0x7d +# GFX12-REAL16: v_cmpx_nlt_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x1d,0x7d] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x1d,0x7d] + +0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x1d,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x3c,0x7d # GFX12: v_cmpx_nlt_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x3c,0x7d] @@ -3620,49 +3930,80 @@ # GFX12: v_cmpx_nlt_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x5d,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x0e,0x7d] 0x7f,0x05,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x0e,0x7d] 0x01,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x0e,0x7d] 0x69,0x04,0x0e,0x7d -# GFX12: 
v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x0e,0x7d] 0x6a,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x0e,0x7d] 0x6b,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x0e,0x7d] 0x7b,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x0e,0x7d] 0x7d,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x0e,0x7d] 0x7e,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x0e,0x7d] 0x7f,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x0e,0x7d] 0x7c,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x0e,0x7d] 0xc1,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x0e,0x7d] 0xf0,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x0e,0x7d] 0xfd,0x04,0x0e,0x7d -# GFX12: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX12-REAL16: v_cmpx_o_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x0e,0x7d] 0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x0e,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x0e,0x7d +# GFX12-REAL16: v_cmpx_o_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x0e,0x7d] + +0xff,0x05,0x0e,0x7d +# GFX12-REAL16: v_cmpx_o_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x0e,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x0e,0x7d] + +0xfd,0x04,0x0f,0x7d +# 
GFX12-REAL16: v_cmpx_o_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x0f,0x7d] +# GFX12-FAKE16: v_cmpx_o_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x0f,0x7d] + +0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_o_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x0f,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x2e,0x7d # GFX12: v_cmpx_o_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x2e,0x7d] @@ -3746,49 +4087,80 @@ # GFX12: v_cmpx_o_f64_e32 0xaf123456, v[254:255] ; encoding: [0xff,0xfc,0x4f,0x7d,0x56,0x34,0x12,0xaf] 0x01,0x05,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 v1.l, v2.l ; encoding: [0x01,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v1, v2 ; encoding: [0x01,0x05,0x10,0x7d] 0x7f,0x05,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 v127.l, v2.l ; encoding: [0x7f,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v127, v2 ; encoding: [0x7f,0x05,0x10,0x7d] 0x01,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 s1, v2.l ; encoding: [0x01,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 s1, v2 ; encoding: [0x01,0x04,0x10,0x7d] 0x69,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 s105, v2.l ; encoding: [0x69,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 s105, v2 ; encoding: [0x69,0x04,0x10,0x7d] 0x6a,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 vcc_lo, v2.l ; encoding: [0x6a,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 vcc_lo, v2 ; encoding: [0x6a,0x04,0x10,0x7d] 0x6b,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 vcc_hi, v2.l ; encoding: [0x6b,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 vcc_hi, v2 ; encoding: [0x6b,0x04,0x10,0x7d] 0x7b,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 ttmp15, v2.l ; encoding: [0x7b,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 ttmp15, v2 ; encoding: [0x7b,0x04,0x10,0x7d] 0x7d,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 m0, v2.l ; encoding: [0x7d,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 m0, v2 ; encoding: [0x7d,0x04,0x10,0x7d] 0x7e,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 exec_lo, v2.l ; encoding: [0x7e,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 exec_lo, v2 ; encoding: [0x7e,0x04,0x10,0x7d] 0x7f,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 exec_hi, v2.l ; encoding: [0x7f,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 exec_hi, v2 ; encoding: [0x7f,0x04,0x10,0x7d] 0x7c,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 null, v2.l ; encoding: [0x7c,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 null, v2 ; encoding: [0x7c,0x04,0x10,0x7d] 0xc1,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 -1, v2 ; encoding: 
[0xc1,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 -1, v2.l ; encoding: [0xc1,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 -1, v2 ; encoding: [0xc1,0x04,0x10,0x7d] 0xf0,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 0.5, v2.l ; encoding: [0xf0,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 0.5, v2 ; encoding: [0xf0,0x04,0x10,0x7d] 0xfd,0x04,0x10,0x7d -# GFX12: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX12-REAL16: v_cmpx_u_f16_e32 src_scc, v2.l ; encoding: [0xfd,0x04,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 src_scc, v2 ; encoding: [0xfd,0x04,0x10,0x7d] 0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00 -# GFX12: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.l ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v127 ; encoding: [0xff,0xfe,0x10,0x7d,0x0b,0xfe,0x00,0x00] + +0x81,0x05,0x10,0x7d +# GFX12-REAL16: v_cmpx_u_f16_e32 v1.h, v2.l ; encoding: [0x81,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v129/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0x81,0x05,0x10,0x7d] + +0xff,0x05,0x10,0x7d +# GFX12-REAL16: v_cmpx_u_f16_e32 v127.h, v2.l ; encoding: [0xff,0x05,0x10,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 v255/*Invalid register, operand has 'VS_32_Lo128' register class*/, v2 ; encoding: [0xff,0x05,0x10,0x7d] + +0xfd,0x04,0x11,0x7d +# GFX12-REAL16: v_cmpx_u_f16_e32 src_scc, v2.h ; encoding: [0xfd,0x04,0x11,0x7d] +# GFX12-FAKE16: v_cmpx_u_f16_e32 src_scc, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xfd,0x04,0x11,0x7d] + +0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00 +# GFX12-REAL16: v_cmpx_u_f16_e32 0xfe0b, v127.h ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16_e32 0xfe0b, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ ; encoding: [0xff,0xfe,0x11,0x7d,0x0b,0xfe,0x00,0x00] 0x01,0x05,0x30,0x7d # GFX12: v_cmpx_u_f32_e32 v1, v2 ; encoding: [0x01,0x05,0x30,0x7d] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt index 0289ed58f07e3..61655c0aa2001 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp16.txt @@ -429,46 +429,68 @@ # GFX12: v_cmpx_eq_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x95,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0xe4,0x00,0xff] 
0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_ge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_ge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2c,0x7d,0x01,0x1b,0x00,0xff] @@ -725,46 +747,68 @@ # GFX12: v_cmpx_ge_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9d,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_ror:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x08,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_gt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x08,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_gt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x09,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_gt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_gt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x09,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_gt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x28,0x7d,0x01,0x1b,0x00,0xff] @@ -1021,46 +1065,68 @@ # GFX12: v_cmpx_gt_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x99,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 
row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x06,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_le_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_le_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x06,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_le_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x07,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_le_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_le_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x07,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_le_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x26,0x7d,0x01,0x1b,0x00,0xff] @@ -1317,46 +1383,68 @@ # GFX12: v_cmpx_le_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xfa,0xfe,0x97,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x1f,0x01,0xff] 
0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_lg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_lg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_lg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_lg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xfa,0xfe,0x0b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_lg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2a,0x7d,0x01,0x1b,0x00,0xff] @@ -1931,46 +2019,68 @@ # GFX12: v_cmpx_ne_u32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x9b,0x7d,0xff,0x6f,0x0d,0x30] 0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf 
bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1a,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_neq_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1a,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_neq_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1b,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_neq_f16 -|v127.h|, -|v127.h| 
row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_neq_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_neq_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3a,0x7d,0x01,0x1b,0x00,0xff] @@ -2015,46 +2125,68 @@ # GFX12: v_cmpx_neq_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3b,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x12,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nge_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x12,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nge_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: 
[0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x13,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nge_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nge_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x13,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nge_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x32,0x7d,0x01,0x1b,0x00,0xff] @@ -2099,46 +2231,68 @@ # GFX12: v_cmpx_nge_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x33,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: 
v_cmpx_ngt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x16,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_ngt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 
bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x16,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_ngt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x17,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_ngt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_ngt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x17,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_ngt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x36,0x7d,0x01,0x1b,0x00,0xff] @@ -2183,46 +2337,68 @@ # GFX12: v_cmpx_ngt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x37,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf 
; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nle_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 row_xmask:0 
row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x18,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nle_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x18,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nle_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x19,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nle_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nle_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x19,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nle_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x38,0x7d,0x01,0x1b,0x00,0xff] @@ -2267,46 +2443,68 @@ # GFX12: v_cmpx_nle_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x39,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] +# 
GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0xfa,0x04,0x14,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x14,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nlg_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x14,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlg_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x15,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nlg_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlg_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x15,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlg_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x34,0x7d,0x01,0x1b,0x00,0xff] @@ -2351,46 +2549,68 @@ # GFX12: v_cmpx_nlg_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x35,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_mirror 
row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x50,0x01,0xff] 
0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1c,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_nlt_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1c,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_nlt_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x1d,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_nlt_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_nlt_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x1d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_nlt_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x3c,0x7d,0x01,0x1b,0x00,0xff] @@ -2435,46 +2655,68 @@ # GFX12: v_cmpx_nlt_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x3d,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x0e,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0e,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_o_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_o_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0e,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_o_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x0f,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_o_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_o_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x0f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_o_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x2e,0x7d,0x01,0x1b,0x00,0xff] @@ -2519,46 +2761,68 @@ # GFX12: v_cmpx_o_f32 -|v255|, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x2f,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff -# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1b,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff -# GFX12: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0xe4,0x00,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x40,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x41,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x01,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x0f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x11,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x1f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x21,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x2f,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff -# GFX12: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x50,0x01,0xff] 0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01 -# GFX12: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x5f,0x01,0x01] 0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13 -# GFX12: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16 v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x10,0x7d,0x01,0x60,0x01,0x13] 0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30 -# GFX12: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-REAL16: v_cmpx_u_f16 -|v127.l|, -|v127.l| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_u_f16 -|v127|, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x10,0x7d,0x7f,0x6f,0xfd,0x30] + +0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13 +# GFX12-REAL16: v_cmpx_u_f16 v1.h, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] +# GFX12-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0x04,0x11,0x7d,0x81,0x60,0x01,0x13] + +0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30 +# GFX12-REAL16: v_cmpx_u_f16 -|v127.h|, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] +# GFX12-FAKE16: v_cmpx_u_f16 -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/|, -|v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xfe,0x11,0x7d,0xff,0x6f,0xfd,0x30] 0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff # GFX12: v_cmpx_u_f32 v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x30,0x7d,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt index e55d6a4c77c2a..bcd75b7bfa087 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vopcx_dpp8.txt @@ -93,10 +93,20 @@ # GFX12: v_cmpx_eq_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x95,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05 -# 
GFX12: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2c,0x7d,0x01,0x77,0x39,0x05] @@ -149,10 +159,20 @@ # GFX12: v_cmpx_ge_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_gt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x08,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_gt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x08,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_gt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_gt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x09,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_gt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_gt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x09,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_gt_f32 v1, 
v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x28,0x7d,0x01,0x77,0x39,0x05] @@ -205,10 +225,20 @@ # GFX12: v_cmpx_gt_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x99,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_le_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x06,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_le_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x06,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_le_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_le_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x07,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_le_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_le_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x07,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_le_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x26,0x7d,0x01,0x77,0x39,0x05] @@ -261,10 +291,20 @@ # GFX12: v_cmpx_le_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x97,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_lg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_lg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_lg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_lg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0b,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_lg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_lg_f16 v255/*Invalid register, 
operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_lg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2a,0x7d,0x01,0x77,0x39,0x05] @@ -383,10 +423,20 @@ # GFX12: v_cmpx_ne_u32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x9b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_neq_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1a,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_neq_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1a,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_neq_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_neq_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1b,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_neq_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_neq_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_neq_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3a,0x7d,0x01,0x77,0x39,0x05] @@ -395,10 +445,20 @@ # GFX12: v_cmpx_neq_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3b,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nge_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x12,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nge_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x12,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nge_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nge_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ 
dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x13,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nge_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nge_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x13,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nge_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x32,0x7d,0x01,0x77,0x39,0x05] @@ -407,10 +467,20 @@ # GFX12: v_cmpx_nge_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x33,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_ngt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x16,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_ngt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x16,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_ngt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_ngt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x17,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_ngt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_ngt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x17,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_ngt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x36,0x7d,0x01,0x77,0x39,0x05] @@ -419,10 +489,20 @@ # GFX12: v_cmpx_ngt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x37,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nle_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x18,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nle_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x18,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05 +# 
GFX12-REAL16: v_cmpx_nle_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nle_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x19,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nle_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nle_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x19,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nle_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x38,0x7d,0x01,0x77,0x39,0x05] @@ -431,10 +511,20 @@ # GFX12: v_cmpx_nle_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x39,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlg_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x14,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_nlg_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x14,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlg_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlg_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x15,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlg_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlg_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x15,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlg_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x34,0x7d,0x01,0x77,0x39,0x05] @@ -443,10 +533,20 @@ # GFX12: v_cmpx_nlg_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x35,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_nlt_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1c,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] +# 
GFX12-REAL16: v_cmpx_nlt_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1c,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_nlt_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_nlt_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x1d,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_nlt_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_nlt_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x1d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_nlt_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x3c,0x7d,0x01,0x77,0x39,0x05] @@ -455,10 +555,20 @@ # GFX12: v_cmpx_nlt_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x3d,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_o_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0e,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_o_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0e,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_o_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_o_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0f,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_o_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_o_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x0f,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_o_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x2e,0x7d,0x01,0x77,0x39,0x05] @@ -467,10 +577,20 @@ # GFX12: v_cmpx_o_f32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x2f,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05 -# GFX12: v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +# GFX12-REAL16: v_cmpx_u_f16 v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] +# GFX12-FAKE16: 
v_cmpx_u_f16 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x10,0x7d,0x01,0x77,0x39,0x05] 0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00 -# GFX12: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-REAL16: v_cmpx_u_f16 v127.l, v127.l dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16 v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x10,0x7d,0x7f,0x00,0x00,0x00] + +0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05 +# GFX12-REAL16: v_cmpx_u_f16 v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05] +# GFX12-FAKE16: v_cmpx_u_f16 v129/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v130/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x11,0x7d,0x81,0x77,0x39,0x05] + +0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00 +# GFX12-REAL16: v_cmpx_u_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00] +# GFX12-FAKE16: v_cmpx_u_f16 v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/, v255/*Invalid register, operand has 'VGPR_32_Lo128' register class*/ dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xfe,0x11,0x7d,0xff,0x00,0x00,0x00] 0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05 # GFX12: v_cmpx_u_f32 v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x30,0x7d,0x01,0x77,0x39,0x05] From 5aafc6d58f3405662902cee006be11e599801b88 Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Mon, 27 Jan 2025 16:18:47 +0100 Subject: [PATCH 205/432] [Polly] Fix typos discovered by codespell (#124545) Patch created using the following command line: ```bash codespell polly --skip="*.pdf,polly/lib/External/*" --write-changes \ --ignore-words-list=couter,createor,distribues,doble,identty,indention,indx,olt,ore,padd,sais,te,theses ``` --- polly/docs/Architecture.rst | 2 +- polly/docs/doxygen.cfg.in | 4 ++-- polly/include/polly/CodeGen/RuntimeDebugBuilder.h | 2 +- polly/include/polly/DependenceInfo.h | 4 ++-- polly/include/polly/ScopInfo.h | 2 +- polly/include/polly/Support/SCEVAffinator.h | 2 +- polly/include/polly/Support/ScopHelper.h | 4 ++-- polly/lib/Analysis/ScopBuilder.cpp | 2 +- polly/lib/Analysis/ScopDetectionDiagnostic.cpp | 2 +- polly/lib/CodeGen/IslNodeBuilder.cpp | 2 +- polly/lib/CodeGen/Utils.cpp | 2 +- polly/lib/Support/SCEVAffinator.cpp | 2 +- polly/lib/Support/ScopHelper.cpp | 2 +- polly/lib/Transform/DeLICM.cpp | 2 +- polly/lib/Transform/ManualOptimizer.cpp | 2 +- polly/lib/Transform/MatmulOptimizer.cpp | 2 +- polly/lib/Transform/ScheduleOptimizer.cpp | 2 +- polly/lib/Transform/ScheduleTreeTransform.cpp | 8 ++++---- .../CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll | 2 +- polly/test/CodeGen/multiple-codegens.ll | 2 +- polly/test/CodeGen/multiple-scops-in-a-row.ll | 2 +- polly/test/CodeGen/reduction_2.ll | 2 +- polly/test/CodeGen/scalar-store-from-same-bb.ll | 2 +- polly/test/CodeGen/test-invalid-operands-for-select.ll | 2 +- polly/test/DeLICM/load-in-cond-inf-loop.ll | 2 +- polly/test/DeLICM/pr41656.ll | 2 +- polly/test/DeLICM/pr48783.ll | 2 +- polly/test/DeLICM/reject_outofquota.ll | 2 +- .../reduction_modulo_schedule_multiple_dimensions_2.ll | 2 +- .../reduction_modulo_schedule_multiple_dimensions_3.ll | 2 +- .../reduction_modulo_schedule_multiple_dimensions_4.ll | 2 +- .../pattern-matching-based-opts-after-delicm.ll | 2 +- .../pattern-matching-based-opts-after-delicm_2.ll | 2 +- 
polly/test/ScopDetect/scev_remove_max.ll | 2 +- .../test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll | 2 +- .../test/ScopInfo/NonAffine/non_affine_loop_used_later.ll | 2 +- .../test/ScopInfo/allow-all-parameters-dereferencable.ll | 2 +- .../invariant_same_loop_bound_multiple_times-1.ll | 2 +- .../invariant_same_loop_bound_multiple_times-2.ll | 2 +- polly/test/ScopInfo/multidim_gep_pointercast2.ll | 2 +- polly/test/ScopInfo/multidim_many_references.ll | 2 +- polly/test/ScopInfo/scalar_to_array.ll | 2 +- polly/test/ScopInfo/zero_ext_of_truncate.ll | 2 +- polly/test/create_ll.sh | 2 +- polly/utils/pyscop/isl.py | 4 ++-- polly/www/changelog.html | 2 +- polly/www/get_started.html | 2 +- polly/www/index.html | 8 ++++---- polly/www/projects.html | 2 +- polly/www/publications.html | 2 +- 50 files changed, 60 insertions(+), 60 deletions(-) diff --git a/polly/docs/Architecture.rst b/polly/docs/Architecture.rst index 645d6522e8694..506e0ff8d26be 100644 --- a/polly/docs/Architecture.rst +++ b/polly/docs/Architecture.rst @@ -27,7 +27,7 @@ executed in the so-called **Inliner cycle**, This is again a set of **Scalar Simplification** passes, a set of **Simple Loop Optimizations**, and the **Inliner** itself. Even though these passes make up the majority of the LLVM pass pipeline, the primary goal of these passes is still canonicalization -without loosing semantic information that complicates later analysis. As part of +without losing semantic information that complicates later analysis. As part of the inliner cycle, the LLVM inliner step-by-step tries to inline functions, runs canonicalization passes to exploit newly exposed simplification opportunities, and then tries to inline the further simplified functions. Some simple loop diff --git a/polly/docs/doxygen.cfg.in b/polly/docs/doxygen.cfg.in index befe6e138ec27..a553fc8154fb1 100644 --- a/polly/docs/doxygen.cfg.in +++ b/polly/docs/doxygen.cfg.in @@ -1066,7 +1066,7 @@ HTML_STYLESHEET = # defined cascading style sheet that is included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the -# standard style sheet and is therefor more robust against future updates. +# standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet file to the output directory. For an example # see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1950,7 +1950,7 @@ PREDEFINED = EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will -# remove all refrences to function-like macros that are alone on a line, have an +# remove all references to function-like macros that are alone on a line, have an # all uppercase name, and do not end with a semicolon. Such function macros are # typically used for boiler-plate code, and will confuse the parser if not # removed. diff --git a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h index 46193f9ccaf2c..2e5349d9edd6e 100644 --- a/polly/include/polly/CodeGen/RuntimeDebugBuilder.h +++ b/polly/include/polly/CodeGen/RuntimeDebugBuilder.h @@ -118,7 +118,7 @@ struct RuntimeDebugBuilder { /// Call fflush /// - /// @parma Builder The builder used to insert the code. + /// @param Builder The builder used to insert the code. 
static void createFlush(PollyIRBuilder &Builder); }; } // namespace polly diff --git a/polly/include/polly/DependenceInfo.h b/polly/include/polly/DependenceInfo.h index 7526a294c6baf..d562ad80592f2 100644 --- a/polly/include/polly/DependenceInfo.h +++ b/polly/include/polly/DependenceInfo.h @@ -211,7 +211,7 @@ struct DependenceAnalysis final : public AnalysisInfoMixin { /// Invalidate the dependence information and recompute it when needed /// again. - /// May be required when the underlaying Scop was changed in a way that + /// May be required when the underlying Scop was changed in a way that /// would add new dependencies (e.g. between new statement instances /// insierted into the SCoP) or intentionally breaks existing ones. It is /// not required when updating the schedule that conforms the existing @@ -251,7 +251,7 @@ class DependenceInfo final : public ScopPass { const Dependences &recomputeDependences(Dependences::AnalysisLevel Level); /// Invalidate the dependence information and recompute it when needed again. - /// May be required when the underlaying Scop was changed in a way that would + /// May be required when the underlying Scop was changed in a way that would /// add new dependencies (e.g. between new statement instances insierted into /// the SCoP) or intentionally breaks existing ones. It is not required when /// updating the schedule that conforms the existing dependencies. diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h index 974de817e72db..ab0f81dd2836d 100644 --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -1494,7 +1494,7 @@ class ScopStmt final { /// @param Access The access to add. /// @param Prepend If true, will add @p Access before all other instructions /// (instead of appending it). - void addAccess(MemoryAccess *Access, bool Preprend = false); + void addAccess(MemoryAccess *Access, bool Prepend = false); /// Remove a MemoryAccess from this statement. /// diff --git a/polly/include/polly/Support/SCEVAffinator.h b/polly/include/polly/Support/SCEVAffinator.h index 63a341a01309b..faacfd8ba0e69 100644 --- a/polly/include/polly/Support/SCEVAffinator.h +++ b/polly/include/polly/Support/SCEVAffinator.h @@ -50,7 +50,7 @@ class SCEVAffinator final : public llvm::SCEVVisitor { /// Check an AddRec for the loop @p L is cached. bool hasNSWAddRecForLoop(llvm::Loop *L) const; - /// Return the LoopInfo used by thi object. + /// Return the LoopInfo used by the object. llvm::LoopInfo *getLI() const { return &LI; } private: diff --git a/polly/include/polly/Support/ScopHelper.h b/polly/include/polly/Support/ScopHelper.h index 13852ecb18ee7..7818f67b505fd 100644 --- a/polly/include/polly/Support/ScopHelper.h +++ b/polly/include/polly/Support/ScopHelper.h @@ -83,7 +83,7 @@ using RecordedAssumptionsTy = llvm::SmallVector; /// /// This function will add the assumption to the RecordedAssumptions. This /// collection will be added (@see addAssumption) to the assumed context once -/// all paramaters are known and the context is fully built. +/// all parameters are known and the context is fully built. /// /// @param RecordedAssumption container which keeps all recorded assumptions. /// @param Kind The assumption kind describing the underlying cause. 
@@ -132,7 +132,7 @@ using BoxedLoopsSetTy = llvm::SetVector; /// isNull(), isInstruction(), isLoad(), isStore(), ..., isMemTransferInst(), /// operator bool(), operator!() /// -/// The functions isa, cast, cast_or_null, dyn_cast are modeled te resemble +/// The functions isa, cast, cast_or_null, dyn_cast are modeled to resemble /// those from llvm/Support/Casting.h. Partial template function specialization /// is currently not supported in C++ such that those cannot be used directly. /// (llvm::isa could, but then llvm:cast etc. would not have the expected diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp index 82fa9e11550f2..76c9b4775784e 100644 --- a/polly/lib/Analysis/ScopBuilder.cpp +++ b/polly/lib/Analysis/ScopBuilder.cpp @@ -2522,7 +2522,7 @@ combineReductionType(MemoryAccess::ReductionType RT0, return MemoryAccess::RT_NONE; } -/// True if @p AllAccs intersects with @p MemAccs execpt @p LoadMA and @p +/// True if @p AllAccs intersects with @p MemAccs except @p LoadMA and @p /// StoreMA bool hasIntersectingAccesses(isl::set AllAccs, MemoryAccess *LoadMA, MemoryAccess *StoreMA, isl::set Domain, diff --git a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp index 14a6f074454f7..f810d543f1ac0 100644 --- a/polly/lib/Analysis/ScopDetectionDiagnostic.cpp +++ b/polly/lib/Analysis/ScopDetectionDiagnostic.cpp @@ -71,7 +71,7 @@ static Statistic RejectStatistics[] = { SCOP_STAT(LoopOnlySomeLatches, "Not all loop latches in scop"), SCOP_STAT(FuncCall, "Function call with side effects"), SCOP_STAT(NonSimpleMemoryAccess, - "Compilated access semantics (volatile or atomic)"), + "Complicated access semantics (volatile or atomic)"), SCOP_STAT(Alias, "Base address aliasing"), SCOP_STAT(Other, ""), SCOP_STAT(IntToPtr, "Integer to pointer conversions"), diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp index 739bd63a5eb80..bbe7bc42cd833 100644 --- a/polly/lib/CodeGen/IslNodeBuilder.cpp +++ b/polly/lib/CodeGen/IslNodeBuilder.cpp @@ -1159,7 +1159,7 @@ bool IslNodeBuilder::preloadInvariantEquivClass( // For an equivalence class of invariant loads we pre-load the representing // element with the unified execution context. However, we have to map all // elements of the class to the one preloaded load as they are referenced - // during the code generation and therefor need to be mapped. + // during the code generation and therefore need to be mapped. const MemoryAccessList &MAs = IAClass.InvariantAccesses; if (MAs.empty()) return true; diff --git a/polly/lib/CodeGen/Utils.cpp b/polly/lib/CodeGen/Utils.cpp index 3afb2e580889b..e95705616022f 100644 --- a/polly/lib/CodeGen/Utils.cpp +++ b/polly/lib/CodeGen/Utils.cpp @@ -46,7 +46,7 @@ static BasicBlock *splitEdge(BasicBlock *Prev, BasicBlock *Succ, // llvm::SplitCriticalEdge is more efficient than // llvm::SplitBlockPredecessors, which is more general. In the future we might // either modify llvm::SplitCriticalEdge to allow skipping the critical edge - // check; or Copy&Pase it here. + // check; or Copy&Paste it here. 
BasicBlock *MiddleBlock = SplitBlockPredecessors( Succ, ArrayRef(Prev), Suffix, DT, LI); diff --git a/polly/lib/Support/SCEVAffinator.cpp b/polly/lib/Support/SCEVAffinator.cpp index ce4467f082ba4..87e0fc056ca4b 100644 --- a/polly/lib/Support/SCEVAffinator.cpp +++ b/polly/lib/Support/SCEVAffinator.cpp @@ -502,7 +502,7 @@ PWACtx SCEVAffinator::visitUDivExpr(const SCEVUDivExpr *Expr) { } // TODO: One can represent the dividend as piece-wise function to be more - // precise but therefor a heuristic is needed. + // precise but therefore a heuristic is needed. // Assume a non-negative dividend. takeNonNegativeAssumption(DividendPWAC, RecordedAssumptions); diff --git a/polly/lib/Support/ScopHelper.cpp b/polly/lib/Support/ScopHelper.cpp index bece390d31dc6..fbf891be0a1e7 100644 --- a/polly/lib/Support/ScopHelper.cpp +++ b/polly/lib/Support/ScopHelper.cpp @@ -456,7 +456,7 @@ struct ScopExpander final : SCEVVisitor { // FIXME: This emits a SCEV for GenSE (since GenLRepl will refer to the // induction variable of a generated loop), so we should not use SCEVVisitor - // with it. Howver, it still contains references to the SCoP region. + // with it. However, it still contains references to the SCoP region. return visit(Evaluated); } ///} diff --git a/polly/lib/Transform/DeLICM.cpp b/polly/lib/Transform/DeLICM.cpp index b7e464e6739c6..9a9768afe113e 100644 --- a/polly/lib/Transform/DeLICM.cpp +++ b/polly/lib/Transform/DeLICM.cpp @@ -1290,7 +1290,7 @@ class DeLICMImpl final : public ZoneAlgorithm { continue; } - // Check for more than one element acces per statement instance. + // Check for more than one element access per statement instance. // Currently we expect write accesses to be functional, eg. disallow // // { Stmt[0] -> [i] : 0 <= i < 2 } diff --git a/polly/lib/Transform/ManualOptimizer.cpp b/polly/lib/Transform/ManualOptimizer.cpp index 0e330f207fbc4..98ab9819e5b1f 100644 --- a/polly/lib/Transform/ManualOptimizer.cpp +++ b/polly/lib/Transform/ManualOptimizer.cpp @@ -149,7 +149,7 @@ class SearchTransformVisitor final // transformed in innermost-first order. isl::schedule Result; - /// Check wether a schedule after a transformation is legal. Return the old + /// Check whether a schedule after a transformation is legal. Return the old /// schedule without the transformation. isl::schedule checkDependencyViolation(llvm::MDNode *LoopMD, llvm::Value *CodeRegion, diff --git a/polly/lib/Transform/MatmulOptimizer.cpp b/polly/lib/Transform/MatmulOptimizer.cpp index ff1683b2d63c5..01d431a97e7db 100644 --- a/polly/lib/Transform/MatmulOptimizer.cpp +++ b/polly/lib/Transform/MatmulOptimizer.cpp @@ -1759,7 +1759,7 @@ static bool isTCPattern(isl::schedule_node Node, const Dependences *D, // // For example, this covers the matrix multiplication pattern after a full // run of -polly-optree and -polly-delicm, where the write access is not - // through the original memory access, but trough a PHI node that was + // through the original memory access, but through a PHI node that was // delicmed. Subsequently, such band nodes will be replaced by a single band // node. // diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 55d51982d90e7..070700a64a168 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -771,7 +771,7 @@ static void runIslScheduleOptimizer( return; } - // Apply ISL's algorithm only if not overriden by the user. Note that + // Apply ISL's algorithm only if not overridden by the user. 
Note that // post-rescheduling optimizations (tiling, pattern-based, prevectorization) // rely on the coincidence/permutable annotations on schedule tree bands that // are added by the rescheduling analyzer. Therefore, disabling the diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index f0684de825d27..3f3630027e6e3 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -572,13 +572,13 @@ class BandCollapseRewriter final isl::schedule_node_band Band = RootBand; isl::ctx Ctx = Band.ctx(); - // Do not merge permutable band to avoid loosing the permutability property. + // Do not merge permutable band to avoid losing the permutability property. // Cannot collapse even two permutable loops, they might be permutable // individually, but not necassarily across. if (unsignedFromIslSize(Band.n_member()) > 1u && Band.permutable()) return getBase().visitBand(Band); - // Find collapsable bands. + // Find collapsible bands. SmallVector Nest; int NumTotalLoops = 0; isl::schedule_node Body; @@ -884,10 +884,10 @@ class GreedyFusionRewriter final collectPotentiallyFusableBands(Child, Bands, Child); } - // Direct children that had at least one of its decendants fused. + // Direct children that had at least one of its descendants fused. SmallDenseSet ChangedDirectChildren; - // Fuse neigboring bands until reaching the end of candidates. + // Fuse neighboring bands until reaching the end of candidates. int i = 0; while (i + 1 < (int)Bands.size()) { isl::schedule Fused = diff --git a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll index 28531244421d1..eb7de01ba862c 100644 --- a/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll +++ b/polly/test/CodeGen/LoopParallelMD/do_not_mutate_debug_info.ll @@ -1,4 +1,4 @@ -; This test checks that we do not accidently mutate the debug info when +; This test checks that we do not accidentally mutate the debug info when ; inserting loop parallel metadata. ; RUN: opt %loadNPMPolly < %s -S -polly -passes=polly-codegen -polly-ast-detect-parallel | FileCheck %s ; CHECK-NOT: !7 = !{!7} diff --git a/polly/test/CodeGen/multiple-codegens.ll b/polly/test/CodeGen/multiple-codegens.ll index 2fa974e66df50..a63f8a615ff9e 100644 --- a/polly/test/CodeGen/multiple-codegens.ll +++ b/polly/test/CodeGen/multiple-codegens.ll @@ -6,7 +6,7 @@ ; RegionPassManager. -polly-codegen must not reuse the -polly-ast analysis the ; was created for the first -polly-scops pass. ; The current solution is that only the first -polly-codegen is allowed to -; generate code, the second detects it is re-using an IslAst that belongs to a +; generate code, the second detects it is reusing an IslAst that belongs to a ; different ScopInfo. ; ; int a, b, c; diff --git a/polly/test/CodeGen/multiple-scops-in-a-row.ll b/polly/test/CodeGen/multiple-scops-in-a-row.ll index b81ba04e36463..effae223c152a 100644 --- a/polly/test/CodeGen/multiple-scops-in-a-row.ll +++ b/polly/test/CodeGen/multiple-scops-in-a-row.ll @@ -6,7 +6,7 @@ ; We explicitly check here that the second scop is not code generated. Later ; improvements may make this possible (e.g., Polly gaining support for -; parameteric conditional expressions or a changed code generation order). +; parametric conditional expressions or a changed code generation order). 
; However, in case this happens, we want to ensure this test case is been ; reasoned about and updated accordingly. diff --git a/polly/test/CodeGen/reduction_2.ll b/polly/test/CodeGen/reduction_2.ll index 4aa306775e781..060a1866870e4 100644 --- a/polly/test/CodeGen/reduction_2.ll +++ b/polly/test/CodeGen/reduction_2.ll @@ -88,7 +88,7 @@ if.end: ; preds = %if.then, %for.end declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i32, i1) nounwind ; This is a negative test. We can prove that RED[0] in the conditional after -; the loop is dereferencable and consequently expand the SCoP from the +; the loop is dereferenceable and consequently expand the SCoP from the ; loop to include the conditional. However, during SCoP generation we realize ; that, while RED[0] is invariant, it is written to as part of the same scop ; and can consequently not be hoisted. Hence, we invalidate the scop. diff --git a/polly/test/CodeGen/scalar-store-from-same-bb.ll b/polly/test/CodeGen/scalar-store-from-same-bb.ll index 3f232da37e4cd..0c1164b245a43 100644 --- a/polly/test/CodeGen/scalar-store-from-same-bb.ll +++ b/polly/test/CodeGen/scalar-store-from-same-bb.ll @@ -2,7 +2,7 @@ ; RUN: -passes=polly-codegen -S < %s | FileCheck %s ; This test ensures that the expression N + 1 that is stored in the phi-node -; alloca, is directly computed and not incorrectly transfered through memory. +; alloca, is directly computed and not incorrectly transferred through memory. ; CHECK: store i64 [[REG:%.*]], ptr %res.phiops ; CHECK: [[REG]] = add i64 %N, 1 diff --git a/polly/test/CodeGen/test-invalid-operands-for-select.ll b/polly/test/CodeGen/test-invalid-operands-for-select.ll index 9f5013cf1bb16..fdc98fbb4d9e7 100644 --- a/polly/test/CodeGen/test-invalid-operands-for-select.ll +++ b/polly/test/CodeGen/test-invalid-operands-for-select.ll @@ -2,7 +2,7 @@ ; ; Check that we do not crash as described here: http://llvm.org/PR21167 ; -; In case the pieceweise affine function used to create an isl_ast_expr +; In case the piecewise affine function used to create an isl_ast_expr ; had empty cases (e.g., with contradicting constraints on the ; parameters), it was possible that the condition of the isl_ast_expr ; select was not a comparison but a constant (thus of type i64). diff --git a/polly/test/DeLICM/load-in-cond-inf-loop.ll b/polly/test/DeLICM/load-in-cond-inf-loop.ll index f6e23110aa6f1..a78a4691bb0d5 100644 --- a/polly/test/DeLICM/load-in-cond-inf-loop.ll +++ b/polly/test/DeLICM/load-in-cond-inf-loop.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=print' -disable-output < %s | FileCheck %s -; When %b is 0, %for.body13 is an infite loop. In this case the loaded +; When %b is 0, %for.body13 is an infinite loop. In this case the loaded ; value %1 is not used anywhere. ; This is a problem when DeLICM tries to map %1 to %arrayidx16 because ; %1 has no corresponding when %b == 0 and therefore hat no location diff --git a/polly/test/DeLICM/pr41656.ll b/polly/test/DeLICM/pr41656.ll index d7cfde35a6e80..2a92503809a24 100644 --- a/polly/test/DeLICM/pr41656.ll +++ b/polly/test/DeLICM/pr41656.ll @@ -4,7 +4,7 @@ ; ; This test case has an InvalidContext such that part of the predecessors ; of for.body.us.i lie within the invalid context. This causes a -; consistency check withing the invalid context of PR41656 to fail. +; consistency check within the invalid context of PR41656 to fail. 
; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/DeLICM/pr48783.ll b/polly/test/DeLICM/pr48783.ll index e3c3eb6a19ccf..deba8bfcc5daf 100644 --- a/polly/test/DeLICM/pr48783.ll +++ b/polly/test/DeLICM/pr48783.ll @@ -4,7 +4,7 @@ ; ; PHI predecessors of statement instances can only be reliably derived in defined behaviour situations. In this case, the inner loop's counter would overflow when its upper bound (%call24) is lower than its lower bound (2). However, due to the nsw flag, this would be undefined behavior and therefore not added to any runtime-check context, but to the defined-behaviour context. ; -; Dereived from test case pr41656.ll +; Derived from test case pr41656.ll ; target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/polly/test/DeLICM/reject_outofquota.ll b/polly/test/DeLICM/reject_outofquota.ll index 820679a5349d2..9bc6bf1f23733 100644 --- a/polly/test/DeLICM/reject_outofquota.ll +++ b/polly/test/DeLICM/reject_outofquota.ll @@ -66,7 +66,7 @@ return: ; CHECK: maximal number of operations exceeded during zone analysis ; Check that even if the quota was exceeded in DeLICM, DependenceInfo is still -; successfull since it uses a different operations counter. +; successful since it uses a different operations counter. ; ; DEP: RAW dependences: ; DEP-NOT: n/a diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll index 46b2559c6e0b1..d7f9029fd347a 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_2.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; -; Verify that the outer dimension doesnt't carry reduction dependences +; Verify that the outer dimension doesn't carry reduction dependences ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll index 6f40ee90fef53..f18060a2e20a8 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_3.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; -; Verify that the outer dimension doesnt't carry reduction dependences +; Verify that the outer dimension doesn't carry reduction dependences ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll index f82b9569a88b1..8e2a590c5f57c 100644 --- a/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll +++ b/polly/test/IstAstInfo/reduction_modulo_schedule_multiple_dimensions_4.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=polly-import-jscop,print' -polly-ast-detect-parallel -disable-output < %s | FileCheck %s ; -; Verify that the outer dimension doesnt't carry reduction dependences +; Verify that the outer dimension doesn't carry reduction dependences ; ; CHECK-NOT:#pragma known-parallel reduction ; CHECK: #pragma known-parallel diff --git 
a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll index 8228a5c08f598..6e9ade869ec6c 100644 --- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll +++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm.ll @@ -7,7 +7,7 @@ ; Check that the pattern matching detects the matrix multiplication pattern ; after a full run of -polly-optree and -polly-delicm, where the write access -; is not through the original memory access, but trough a PHI node that was +; is not through the original memory access, but through a PHI node that was ; delicmed. This test covers the polybench 2mm and 3mm cases. ; ; This test case generates the following schedule, which contains filters: diff --git a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll index 4bda7584f5962..4ef0605a0ba75 100644 --- a/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll +++ b/polly/test/ScheduleOptimizer/pattern-matching-based-opts-after-delicm_2.ll @@ -5,7 +5,7 @@ ; ; Check that the pattern matching detects the tensor contraction pattern ; after a full run of -polly-delicm. This test case generates the following -; schedule, which contans two band nodes. Without DeLICM two statement are +; schedule, which contains two band nodes. Without DeLICM two statement are ; generated. ; ; domain: "{ Stmt5[i0, i1, i2, i3, i4, i5] : 0 <= i0 <= 31 and 0 <= i1 <= 31 and diff --git a/polly/test/ScopDetect/scev_remove_max.ll b/polly/test/ScopDetect/scev_remove_max.ll index caf55bf87a667..f76c832ff08f5 100644 --- a/polly/test/ScopDetect/scev_remove_max.ll +++ b/polly/test/ScopDetect/scev_remove_max.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=print' < %s -; This test case helps to determine wether SCEVRemoveMax::remove produces +; This test case helps to determine whether SCEVRemoveMax::remove produces ; an infinite loop and a segmentation fault, if it processes, for example, ; '((-1 + (-1 * %b1)) umax {(-1 + (-1 * %yStart)),+,-1}<%.preheader>)'. ; diff --git a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll index 92028093f70bb..3743bfae9fcaf 100644 --- a/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll +++ b/polly/test/ScopDetectionDiagnostics/ReportLoopHasNoExit.ll @@ -1,7 +1,7 @@ ; XFAIL: * ; The test case stopped making sense after r310940 that added infinite loops to -; the PostDominatorTree. Infinite loops are postdominated ony by the virtual +; the PostDominatorTree. Infinite loops are postdominated only by the virtual ; root, which causes them not to appear in regions in ScopDetection anymore. 
; RUN: opt %loadNPMPolly -pass-remarks-missed="polly-detect" -polly-allow-nonaffine-loops '-passes=print' -disable-output < %s 2>&1 | FileCheck %s diff --git a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll index 79b61eca258f2..63ff354d7e5f7 100644 --- a/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll +++ b/polly/test/ScopInfo/NonAffine/non_affine_loop_used_later.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops '-passes=print,print' -disable-output < %s 2>&1 | FileCheck %s ; RUN: opt %loadNPMPolly -polly-allow-nonaffine -polly-allow-nonaffine-branches -polly-allow-nonaffine-loops -polly-unprofitable-scalar-accs=true -polly-process-unprofitable=false '-passes=print,print' -disable-output < %s 2>&1 | FileCheck %s --check-prefix=PROFIT ; -; Verify that we over approximate the read acces of A[j] in the last statement as j is +; Verify that we over approximate the read access of A[j] in the last statement as j is ; computed in a non-affine loop we do not model. ; ; CHECK: Function: f diff --git a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll index cb06e352da658..70c3c56fb3112 100644 --- a/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll +++ b/polly/test/ScopInfo/allow-all-parameters-dereferencable.ll @@ -28,7 +28,7 @@ ; CODE-RTC-NEXT: br i1 %{{[a-zA-Z0-9\.]*}}, label %polly.preload.exec, label %polly.preload.merge ; Check that we don't generate a runtime check because we treat all -; parameters as dereferencable. +; parameters as dereferenceable. ; CODE-NOT: polly.preload.cond: ; preds = %polly.preload.begin ; CODE-NOT: br i1 %{{r1:[a-zA-Z0-9]*}}, label %polly.preload.exec, label %polly.preload.merge diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll index a473ef30376c1..e3292b4e4aefa 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-1.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadNPMPolly '-passes=print' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all -; three loads that occure in the region but actually access the same +; three loads that occur in the region but actually access the same ; location. Also check that the execution context is the most generic ; one, e.g., here the universal set. ; diff --git a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll index 66a0bc631b1dc..d69438de5817f 100644 --- a/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll +++ b/polly/test/ScopInfo/invariant_same_loop_bound_multiple_times-2.ll @@ -1,7 +1,7 @@ ; RUN: opt %loadNPMPolly '-passes=print' -polly-invariant-load-hoisting=true -disable-output < %s 2>&1 | FileCheck %s ; ; Verify that we only have one parameter and one invariant load for all -; three loads that occure in the region but actually access the same +; three loads that occur in the region but actually access the same ; location. Also check that the execution context is the most generic ; one, e.g., here the universal set. 
; diff --git a/polly/test/ScopInfo/multidim_gep_pointercast2.ll b/polly/test/ScopInfo/multidim_gep_pointercast2.ll index b31a0d0262db9..9daae4b1ce3db 100644 --- a/polly/test/ScopInfo/multidim_gep_pointercast2.ll +++ b/polly/test/ScopInfo/multidim_gep_pointercast2.ll @@ -1,6 +1,6 @@ ; RUN: opt %loadNPMPolly '-passes=print' -disable-output < %s 2>&1 | FileCheck %s ; -; Verfy that we do not use the GetElementPtr information to delinearize A +; Verify that we do not use the GetElementPtr information to delinearize A ; because of the cast in-between. Use the single-dimensional modeling instead. ; ; void f(short A[][2]) { diff --git a/polly/test/ScopInfo/multidim_many_references.ll b/polly/test/ScopInfo/multidim_many_references.ll index 3801fda4923c3..f0f1c2b1f39db 100644 --- a/polly/test/ScopInfo/multidim_many_references.ll +++ b/polly/test/ScopInfo/multidim_many_references.ll @@ -28,7 +28,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; This test case verifies that the construction of the assumed context finishes ; successfully. Depending on how constrained are accumulated in the assumed ; context, this test case can take even for a smaller number of arrays over a -; minute to complete. With the unrolling choosen in this test, an inefficient +; minute to complete. With the unrolling chosen in this test, an inefficient ; formulation of the assumption tracking cause LLVM to crash due to excessive ; memory usage due to an overly large number of disjuncts being formed. diff --git a/polly/test/ScopInfo/scalar_to_array.ll b/polly/test/ScopInfo/scalar_to_array.ll index d64f1696c30b6..3f61d0d723046 100644 --- a/polly/test/ScopInfo/scalar_to_array.ll +++ b/polly/test/ScopInfo/scalar_to_array.ll @@ -109,7 +109,7 @@ return: ; preds = %for.cond ; It is not possible to have a scop which accesses a scalar element that is ; a global variable. All global variables are pointers containing possibly ; a single element. Hence they do not need to be handled anyways. -; Please note that this is still required when scalar to array rewritting is +; Please note that this is still required when scalar to array rewriting is ; disabled. ; CHECK-LABEL: Function: use_after_scop diff --git a/polly/test/ScopInfo/zero_ext_of_truncate.ll b/polly/test/ScopInfo/zero_ext_of_truncate.ll index bd3749b6aa74f..cbe4af05169f8 100644 --- a/polly/test/ScopInfo/zero_ext_of_truncate.ll +++ b/polly/test/ScopInfo/zero_ext_of_truncate.ll @@ -8,7 +8,7 @@ ; } ; } ; -; FIXME: The truncated value should be a paramter. +; FIXME: The truncated value should be a parameter. ; CHECK: Assumed Context: ; CHECK-NEXT: [N, tmp, M] -> { : } ; CHECK-NEXT: Invalid Context: diff --git a/polly/test/create_ll.sh b/polly/test/create_ll.sh index c44d8ed81f2b5..1c03d9a015e01 100755 --- a/polly/test/create_ll.sh +++ b/polly/test/create_ll.sh @@ -12,7 +12,7 @@ opt -correlated-propagation -mem2reg -instcombine -loop-simplify -indvars \ -instnamer ${LLFILE} -S -o ${LLFILE_TMP} # Insert a header into the new testcase containing a sample RUN line a FIXME and -# an XFAIL. Then insert the formated C code and finally the LLVM-IR without +# an XFAIL. Then insert the formatted C code and finally the LLVM-IR without # attributes, the module ID or the target triple. 
 echo '; RUN: opt %loadPolly -S < %s | FileCheck %s' > ${LLFILE}
 echo ';' >> ${LLFILE}
diff --git a/polly/utils/pyscop/isl.py b/polly/utils/pyscop/isl.py
index c06b7bca28042..c5d92ae9812de 100644
--- a/polly/utils/pyscop/isl.py
+++ b/polly/utils/pyscop/isl.py
@@ -72,7 +72,7 @@ def initialize_isl_methods(self):
         if hasattr(self.__class__, "initialized"):
             return
 
-        self.__class__.initalized = True
+        self.__class__.initialized = True
         self.get_isl_method("read_from_str").argtypes = [Context, c_char_p, c_int]
         self.get_isl_method("copy").argtypes = [self.__class__]
         self.get_isl_method("copy").restype = c_int
@@ -204,7 +204,7 @@ def initialize_isl_methods(self):
         if hasattr(self.__class__, "initialized"):
             return
 
-        self.__class__.initalized = True
+        self.__class__.initialized = True
         self.get_isl_method("copy").argtypes = [self.__class__]
         self.get_isl_method("copy").restype = c_int
         self.get_isl_method("free").argtypes = [self.__class__]
diff --git a/polly/www/changelog.html b/polly/www/changelog.html
index 8c789cd31f530..6ba9ba4723058 100644
--- a/polly/www/changelog.html
+++ b/polly/www/changelog.html
@@ -40,7 +40,7 @@

    3.6

  • Run-time alias checks
  • Computation of no-alias information for later LLVM optimizations (vectorizer, LICM, ...)
  • -
  • Support for multi-dimensional arrays of parameteric size (still tested)
  • +
  • Support for multi-dimensional arrays of parametric size (still tested)
  • New assumption tracking framework